From 8222dc03c219717a0e6fc3c584d0b0da5048e183 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Fri, 5 Jan 2024 13:25:27 +0100 Subject: [PATCH 01/39] add try except around datasets to train on broken datasets --- doctr/datasets/datasets/base.py | 54 +++++++++++++++++++-------------- doctr/datasets/detection.py | 16 +++++++--- doctr/datasets/recognition.py | 13 ++++++-- 3 files changed, 53 insertions(+), 30 deletions(-) diff --git a/doctr/datasets/datasets/base.py b/doctr/datasets/datasets/base.py index 58f1ca29f..7ef924ffd 100644 --- a/doctr/datasets/datasets/base.py +++ b/doctr/datasets/datasets/base.py @@ -5,6 +5,7 @@ import os import shutil +import traceback from pathlib import Path from typing import Any, Callable, List, Optional, Tuple, Union @@ -46,28 +47,37 @@ def _read_sample(self, index: int) -> Tuple[Any, Any]: def __getitem__(self, index: int) -> Tuple[Any, Any]: # Read image - img, target = self._read_sample(index) - # Pre-transforms (format conversion at run-time etc.) - if self._pre_transforms is not None: - img, target = self._pre_transforms(img, target) - - if self.img_transforms is not None: - # typing issue cf. https://github.com/python/mypy/issues/5485 - img = self.img_transforms(img) - - if self.sample_transforms is not None: - # Conditions to assess it is detection model with multiple classes and avoid confusion with other tasks. - if ( - isinstance(target, dict) - and all(isinstance(item, np.ndarray) for item in target.values()) - and set(target.keys()) != {"boxes", "labels"} # avoid confusion with obj detection target - ): - img_transformed = _copy_tensor(img) - for class_name, bboxes in target.items(): - img_transformed, target[class_name] = self.sample_transforms(img, bboxes) - img = img_transformed - else: - img, target = self.sample_transforms(img, target) + try: + img, target = self._read_sample(index) + # Pre-transforms (format conversion at run-time etc.) + if self._pre_transforms is not None: + img, target = self._pre_transforms(img, target) + + if self.img_transforms is not None: + # typing issue cf. https://github.com/python/mypy/issues/5485 + img = self.img_transforms(img) + + if self.sample_transforms is not None: + # Conditions to assess it is detection model with multiple classes and avoid confusion with other tasks. + if ( + isinstance(target, dict) + and all(isinstance(item, np.ndarray) for item in target.values()) + and set(target.keys()) != {"boxes", "labels"} # avoid confusion with obj detection target + ): + img_transformed = _copy_tensor(img) + for class_name, bboxes in target.items(): + img_transformed, target[class_name] = self.sample_transforms(img, bboxes) + img = img_transformed + else: + img, target = self.sample_transforms(img, target) + except Exception: + img_name = self.data[index][0] + # Write + print() + print(f"!!!ERROR in Dataset on filename {img_name}") + traceback.print_exc() + print() + return self.__getitem__(0) # should exists ^^ return img, target diff --git a/doctr/datasets/detection.py b/doctr/datasets/detection.py index 0000704df..0e9c0bbf7 100644 --- a/doctr/datasets/detection.py +++ b/doctr/datasets/detection.py @@ -55,14 +55,20 @@ def __init__( self.data: List[Tuple[str, Tuple[np.ndarray, List[str]]]] = [] np_dtype = np.float32 + missing_files = [] for img_name, label in labels.items(): # File existence check if not os.path.exists(os.path.join(self.root, img_name)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}") - - geoms, polygons_classes = self.format_polygons(label["polygons"], use_polygons, np_dtype) - - self.data.append((img_name, (np.asarray(geoms, dtype=np_dtype), polygons_classes))) + missing_files.append(img_name) + # raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}") + else: + geoms, polygons_classes = self.format_polygons(label["polygons"], use_polygons, np_dtype) + self.data.append((img_name, (np.asarray(geoms, dtype=np_dtype), polygons_classes))) + print("List of missing files:") + print(f"MISSING FILES: {len(missing_files)}") + from pprint import pprint + + pprint(missing_files) def format_polygons( self, polygons: Union[List, Dict], use_polygons: bool, np_dtype: Type diff --git a/doctr/datasets/recognition.py b/doctr/datasets/recognition.py index ebf37a20a..381776138 100644 --- a/doctr/datasets/recognition.py +++ b/doctr/datasets/recognition.py @@ -40,11 +40,18 @@ def __init__( with open(labels_path, encoding="utf-8") as f: labels = json.load(f) + missing_files = [] for img_name, label in labels.items(): if not os.path.exists(os.path.join(self.root, img_name)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}") - - self.data.append((img_name, label)) + missing_files.append(img_name) + # raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}") + else: + self.data.append((img_name, label)) + print("List of missing files:") + print(f"MISSING FILES: {len(missing_files)}") + from pprint import pprint + + pprint(missing_files) def merge_dataset(self, ds: AbstractDataset) -> None: # Update data with new root for self From e6faaf6f1d30b12ba751a2075bf70b3cbf315467 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Sat, 6 Jan 2024 16:33:12 +0100 Subject: [PATCH 02/39] fix in collate_fn --- doctr/datasets/datasets/tensorflow.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/doctr/datasets/datasets/tensorflow.py b/doctr/datasets/datasets/tensorflow.py index 86b7b7928..da7890f97 100644 --- a/doctr/datasets/datasets/tensorflow.py +++ b/doctr/datasets/datasets/tensorflow.py @@ -49,10 +49,18 @@ def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]: @staticmethod def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]: - images, targets = zip(*samples) + # FIXME + # problems with some shape != 3 + images, targets = [], [] + for sample in samples: + if sample[0].shape[-1] == 3: + images.append(sample[0]) + targets.append(sample[1]) + + # images, targets = zip(*samples) images = tf.stack(images, axis=0) - return images, list(targets) + return images, targets class VisionDataset(AbstractDataset, _VisionDataset): # noqa: D101 From b5a41bdbb37cf4738b3904e0603d0428cba3524a Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Sat, 6 Jan 2024 17:04:21 +0100 Subject: [PATCH 03/39] Problems with augmentations involving _gaussian_filter --- references/detection/train_tensorflow.py | 28 ++++++++++++++---------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/references/detection/train_tensorflow.py b/references/detection/train_tensorflow.py index 05ee7c890..21b27d1be 100644 --- a/references/detection/train_tensorflow.py +++ b/references/detection/train_tensorflow.py @@ -178,9 +178,11 @@ def main(args): with open(os.path.join(args.val_path, "labels.json"), "rb") as f: val_hash = hashlib.sha256(f.read()).hexdigest() - batch_transforms = T.Compose([ - T.Normalize(mean=(0.798, 0.785, 0.772), std=(0.264, 0.2749, 0.287)), - ]) + batch_transforms = T.Compose( + [ + T.Normalize(mean=(0.798, 0.785, 0.772), std=(0.264, 0.2749, 0.287)), + ] + ) # Load doctr model model = detection.__dict__[args.arch]( @@ -223,9 +225,9 @@ def main(args): # Augmentations T.RandomApply(T.ColorInversion(), 0.1), T.RandomJpegQuality(60), - T.RandomApply(T.GaussianNoise(mean=0.1, std=0.1), 0.1), - T.RandomApply(T.RandomShadow(), 0.1), - T.RandomApply(T.GaussianBlur(kernel_shape=3, std=(0.1, 0.1)), 0.1), + #T.RandomApply(T.GaussianNoise(mean=0.1, std=0.1), 0.1), + #T.RandomApply(T.RandomShadow(), 0.1), + #T.RandomApply(T.GaussianBlur(kernel_shape=3, std=(0.1, 0.1)), 0.1), T.RandomSaturation(0.3), T.RandomContrast(0.3), T.RandomBrightness(0.3), @@ -356,12 +358,14 @@ def main(args): print(log_msg) # W&B if args.wb: - wandb.log({ - "val_loss": val_loss, - "recall": recall, - "precision": precision, - "mean_iou": mean_iou, - }) + wandb.log( + { + "val_loss": val_loss, + "recall": recall, + "precision": precision, + "mean_iou": mean_iou, + } + ) # ClearML if args.clearml: From fbef2cfda880b497a438ab6b0ccd95e21f824adc Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Thu, 1 Feb 2024 23:19:57 +0100 Subject: [PATCH 04/39] from https://github.com/mindee/doctr/pull/1444 --- .../differentiable_binarization/pytorch.py | 38 ++++++++++--------- references/detection/train_pytorch.py | 34 +++++++++-------- references/detection/train_tensorflow.py | 26 +++++++------ 3 files changed, 54 insertions(+), 44 deletions(-) diff --git a/doctr/models/detection/differentiable_binarization/pytorch.py b/doctr/models/detection/differentiable_binarization/pytorch.py index 9e4b81ef9..17686bb28 100644 --- a/doctr/models/detection/differentiable_binarization/pytorch.py +++ b/doctr/models/detection/differentiable_binarization/pytorch.py @@ -57,24 +57,28 @@ def __init__( conv_layer = DeformConv2d if deform_conv else nn.Conv2d - self.in_branches = nn.ModuleList([ - nn.Sequential( - conv_layer(chans, out_channels, 1, bias=False), - nn.BatchNorm2d(out_channels), - nn.ReLU(inplace=True), - ) - for idx, chans in enumerate(in_channels) - ]) + self.in_branches = nn.ModuleList( + [ + nn.Sequential( + conv_layer(chans, out_channels, 1, bias=False), + nn.BatchNorm2d(out_channels), + nn.ReLU(inplace=True), + ) + for idx, chans in enumerate(in_channels) + ] + ) self.upsample = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True) - self.out_branches = nn.ModuleList([ - nn.Sequential( - conv_layer(out_channels, out_chans, 3, padding=1, bias=False), - nn.BatchNorm2d(out_chans), - nn.ReLU(inplace=True), - nn.Upsample(scale_factor=2**idx, mode="bilinear", align_corners=True), - ) - for idx, chans in enumerate(in_channels) - ]) + self.out_branches = nn.ModuleList( + [ + nn.Sequential( + conv_layer(out_channels, out_chans, 3, padding=1, bias=False), + nn.BatchNorm2d(out_chans), + nn.ReLU(inplace=True), + nn.Upsample(scale_factor=2**idx, mode="bilinear", align_corners=True), + ) + for idx, chans in enumerate(in_channels) + ] + ) def forward(self, x: List[torch.Tensor]) -> torch.Tensor: if len(x) != len(self.out_branches): diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py index 4f6401151..e3fe2c9f8 100644 --- a/references/detection/train_pytorch.py +++ b/references/detection/train_pytorch.py @@ -266,15 +266,17 @@ def main(args): train_set = DetectionDataset( img_folder=os.path.join(args.train_path, "images"), label_path=os.path.join(args.train_path, "labels.json"), - img_transforms=Compose([ - # Augmentations - T.RandomApply(T.ColorInversion(), 0.1), - T.RandomApply(T.GaussianNoise(mean=0.1, std=0.1), 0.1), - T.RandomApply(T.RandomShadow(), 0.1), - T.RandomApply(GaussianBlur(kernel_size=3), 0.1), - RandomPhotometricDistort(p=0.05), - RandomGrayscale(p=0.05), - ]), + img_transforms=Compose( + [ + # Augmentations + T.RandomApply(T.ColorInversion(), 0.1), + T.RandomApply(T.GaussianNoise(mean=0.1, std=0.1), 0.1), + T.RandomApply(T.RandomShadow(), 0.1), + T.RandomApply(GaussianBlur(kernel_size=3), 0.1), + RandomPhotometricDistort(p=0.05), + RandomGrayscale(p=0.05), + ] + ), sample_transforms=T.SampleCompose( ( [T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True)] @@ -390,12 +392,14 @@ def main(args): print(log_msg) # W&B if args.wb: - wandb.log({ - "val_loss": val_loss, - "recall": recall, - "precision": precision, - "mean_iou": mean_iou, - }) + wandb.log( + { + "val_loss": val_loss, + "recall": recall, + "precision": precision, + "mean_iou": mean_iou, + } + ) if args.early_stop and early_stopper.early_stop(val_loss): print("Training halted early due to reaching patience limit.") break diff --git a/references/detection/train_tensorflow.py b/references/detection/train_tensorflow.py index 21b27d1be..fd0271c18 100644 --- a/references/detection/train_tensorflow.py +++ b/references/detection/train_tensorflow.py @@ -221,18 +221,20 @@ def main(args): train_set = DetectionDataset( img_folder=os.path.join(args.train_path, "images"), label_path=os.path.join(args.train_path, "labels.json"), - img_transforms=T.Compose([ - # Augmentations - T.RandomApply(T.ColorInversion(), 0.1), - T.RandomJpegQuality(60), - #T.RandomApply(T.GaussianNoise(mean=0.1, std=0.1), 0.1), - #T.RandomApply(T.RandomShadow(), 0.1), - #T.RandomApply(T.GaussianBlur(kernel_shape=3, std=(0.1, 0.1)), 0.1), - T.RandomSaturation(0.3), - T.RandomContrast(0.3), - T.RandomBrightness(0.3), - T.RandomApply(T.ToGray(num_output_channels=3), 0.05), - ]), + img_transforms=T.Compose( + [ + # Augmentations + T.RandomApply(T.ColorInversion(), 0.1), + T.RandomJpegQuality(60), + # T.RandomApply(T.GaussianNoise(mean=0.1, std=0.1), 0.1), + # T.RandomApply(T.RandomShadow(), 0.1), + # T.RandomApply(T.GaussianBlur(kernel_shape=3, std=(0.1, 0.1)), 0.1), + T.RandomSaturation(0.3), + T.RandomContrast(0.3), + T.RandomBrightness(0.3), + T.RandomApply(T.ToGray(num_output_channels=3), 0.05), + ] + ), sample_transforms=T.SampleCompose( ( [T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True)] From c350bd7c4b3b4ede2b9fcbf17567f37a970b3e35 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Tue, 6 Feb 2024 13:30:50 +0100 Subject: [PATCH 05/39] send message on slack (pytorch script) --- references/detection/train_pytorch.py | 45 ++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py index e3fe2c9f8..d23111f18 100644 --- a/references/detection/train_pytorch.py +++ b/references/detection/train_pytorch.py @@ -12,6 +12,7 @@ import logging import multiprocessing as mp import time +from pathlib import Path import numpy as np import psutil @@ -28,6 +29,32 @@ from doctr.utils.metrics import LocalizationConfusion from utils import EarlyStopper, plot_recorder, plot_samples +SLACK_WEBHOOK_URL = None +SLACK_WEBHOOK_PATH = Path(os.path.join(os.path.expanduser("~"), ".config", "doctr", "slack_webhook_url.txt")) +if SLACK_WEBHOOK_PATH.exists(): + with open(SLACK_WEBHOOK_PATH) as f: + SLACK_WEBHOOK_URL = f.read().strip() +else: + print(f"{SLACK_WEBHOOK_PATH} does not exist, skip Slack integration configuration...") + + +def send_on_slack(text: str): + """Send a message on Slack. + + Args: + text (str): message to send on Slack + """ + if SLACK_WEBHOOK_URL: + try: + import requests + + requests.post( + url=SLACK_WEBHOOK_URL, + json={"text": f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}"}, + ) + except Exception: + print("Impossible to send message on Slack, continue...") + def record_lr( model: torch.nn.Module, @@ -106,6 +133,8 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a model.train() # Iterate over the batches of the dataset + last_progress = 0 + interval_progress = 5 pbar = tqdm(train_loader, position=1) for images, targets in pbar: if torch.cuda.is_available(): @@ -130,8 +159,12 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a optimizer.step() scheduler.step() - pbar.set_description(f"Training loss: {train_loss.item():.6}") + current_progress = pbar.n / pbar.total * 100 + if current_progress - last_progress > interval_progress: + send_on_slack(str(pbar)) + last_progress = int(current_progress) + send_on_slack(f"Final training loss: {train_loss.item():.6}") @torch.no_grad() @@ -170,6 +203,7 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): def main(args): print(args) + send_on_slack(f"Start training: {args}") if args.push_to_hub: login_to_hub() @@ -212,6 +246,9 @@ def main(args): collate_fn=val_set.collate_fn, ) print(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in " f"{len(val_loader)} batches)") + send_on_slack( + f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in " f"{len(val_loader)} batches)" + ) with open(os.path.join(args.val_path, "labels.json"), "rb") as f: val_hash = hashlib.sha256(f.read()).hexdigest() @@ -227,6 +264,7 @@ def main(args): # Resume weights if isinstance(args.resume, str): print(f"Resuming {args.resume}") + send_on_slack(f"Resuming {args.resume}") checkpoint = torch.load(args.resume, map_location="cpu") model.load_state_dict(checkpoint) @@ -306,6 +344,9 @@ def main(args): collate_fn=train_set.collate_fn, ) print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in " f"{len(train_loader)} batches)") + send_on_slack( + f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in " f"{len(train_loader)} batches)" + ) with open(os.path.join(args.train_path, "labels.json"), "rb") as f: train_hash = hashlib.sha256(f.read()).hexdigest() @@ -379,6 +420,7 @@ def main(args): val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric, amp=args.amp) if val_loss < min_loss: print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + send_on_slack(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") torch.save(model.state_dict(), f"./{exp_name}.pt") min_loss = val_loss if args.save_interval_epoch: @@ -390,6 +432,7 @@ def main(args): else: log_msg += f"(Recall: {recall:.2%} | Precision: {precision:.2%} | Mean IoU: {mean_iou:.2%})" print(log_msg) + send_on_slack(log_msg) # W&B if args.wb: wandb.log( From f16b19232803b77d7220220e97fcfc11bf7be51c Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Tue, 6 Feb 2024 13:35:21 +0100 Subject: [PATCH 06/39] exclude l1_loss in db model --- doctr/models/detection/differentiable_binarization/pytorch.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doctr/models/detection/differentiable_binarization/pytorch.py b/doctr/models/detection/differentiable_binarization/pytorch.py index 17686bb28..8207b9a54 100644 --- a/doctr/models/detection/differentiable_binarization/pytorch.py +++ b/doctr/models/detection/differentiable_binarization/pytorch.py @@ -287,7 +287,8 @@ def compute_loss( if torch.any(thresh_mask): l1_loss = (torch.abs(thresh_map - thresh_target) * thresh_mask).sum() / (thresh_mask.sum() + eps) - return l1_loss + focal_scale * focal_loss + dice_loss + # return l1_loss + focal_scale * focal_loss + dice_loss + return focal_scale * focal_loss + dice_loss def _dbnet( From 9b0ae9226623b3608285707d0256647e6bad5a30 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Tue, 13 Feb 2024 10:15:54 +0100 Subject: [PATCH 07/39] send_on_slack tf --- references/detection/train_tensorflow.py | 46 ++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/references/detection/train_tensorflow.py b/references/detection/train_tensorflow.py index fd0271c18..bed759187 100644 --- a/references/detection/train_tensorflow.py +++ b/references/detection/train_tensorflow.py @@ -12,6 +12,7 @@ import hashlib import multiprocessing as mp import time +from pathlib import Path import numpy as np import psutil @@ -31,6 +32,32 @@ from doctr.utils.metrics import LocalizationConfusion from utils import EarlyStopper, load_backbone, plot_recorder, plot_samples +SLACK_WEBHOOK_URL = None +SLACK_WEBHOOK_PATH = Path(os.path.join(os.path.expanduser("~"), ".config", "doctr", "slack_webhook_url.txt")) +if SLACK_WEBHOOK_PATH.exists(): + with open(SLACK_WEBHOOK_PATH) as f: + SLACK_WEBHOOK_URL = f.read().strip() +else: + print(f"{SLACK_WEBHOOK_PATH} does not exist, skip Slack integration configuration...") + + +def send_on_slack(text: str): + """Send a message on Slack. + + Args: + text (str): message to send on Slack + """ + if SLACK_WEBHOOK_URL: + try: + import requests + + requests.post( + url=SLACK_WEBHOOK_URL, + json={"text": f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}"}, + ) + except Exception: + print("Impossible to send message on Slack, continue...") + def record_lr( model: tf.keras.Model, @@ -87,6 +114,8 @@ def record_lr( def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False): train_iter = iter(train_loader) # Iterate over the batches of the dataset + last_progress = 0 + interval_progress = 5 pbar = tqdm(train_iter, position=1) for images, targets in pbar: images = batch_transforms(images) @@ -99,6 +128,11 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False): optimizer.apply_gradients(zip(grads, model.trainable_weights)) pbar.set_description(f"Training loss: {train_loss.numpy():.6}") + current_progress = pbar.n / pbar.total * 100 + if current_progress - last_progress > interval_progress: + send_on_slack(str(pbar)) + last_progress = int(current_progress) + send_on_slack(f"Final training loss: {train_loss.item():.6}") def evaluate(model, val_loader, batch_transforms, val_metric): @@ -129,6 +163,7 @@ def evaluate(model, val_loader, batch_transforms, val_metric): def main(args): print(args) + send_on_slack(f"Start training: {args}") if args.push_to_hub: login_to_hub() @@ -175,6 +210,10 @@ def main(args): f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in " f"{val_loader.num_batches} batches)" ) + send_on_slack( + f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in " + f"{val_loader.num_batches} batches)" + ) with open(os.path.join(args.val_path, "labels.json"), "rb") as f: val_hash = hashlib.sha256(f.read()).hexdigest() @@ -264,6 +303,10 @@ def main(args): f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in " f"{train_loader.num_batches} batches)" ) + send_on_slack( + f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in " + f"{train_loader.num_batches} batches)" + ) with open(os.path.join(args.train_path, "labels.json"), "rb") as f: train_hash = hashlib.sha256(f.read()).hexdigest() @@ -347,10 +390,12 @@ def main(args): val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric) if val_loss < min_loss: print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + send_on_slack(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") model.save_weights(f"./{exp_name}/weights") min_loss = val_loss if args.save_interval_epoch: print(f"Saving state at epoch: {epoch + 1}") + send_on_slack(f"Saving state at epoch: {epoch + 1}") model.save_weights(f"./{exp_name}_{epoch + 1}/weights") log_msg = f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} " if any(val is None for val in (recall, precision, mean_iou)): @@ -358,6 +403,7 @@ def main(args): else: log_msg += f"(Recall: {recall:.2%} | Precision: {precision:.2%} | Mean IoU: {mean_iou:.2%})" print(log_msg) + send_on_slack(log_msg) # W&B if args.wb: wandb.log( From 20365993bb82ee43033c21e0782d7b56cef4604f Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Tue, 13 Feb 2024 10:59:40 +0100 Subject: [PATCH 08/39] Revert "fix test" This reverts commit 548772d131d88cea0fd4c2c650230a74ad2e9211. --- tests/pytorch/test_models_zoo_pt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/pytorch/test_models_zoo_pt.py b/tests/pytorch/test_models_zoo_pt.py index 5bcd10ee6..3c6267ab7 100644 --- a/tests/pytorch/test_models_zoo_pt.py +++ b/tests/pytorch/test_models_zoo_pt.py @@ -222,9 +222,9 @@ def test_trained_kie_predictor(mock_payslip): geometry_mr = np.array([[0.1083984375, 0.0634765625], [0.1494140625, 0.0859375]]) assert np.allclose(np.array(out.pages[0].predictions[CLASS_NAME][0].geometry), geometry_mr, rtol=0.05) - assert out.pages[0].predictions[CLASS_NAME][4].value == "revised" + assert out.pages[0].predictions[CLASS_NAME][6].value == "revised" geometry_revised = np.array([[0.7548828125, 0.126953125], [0.8388671875, 0.1484375]]) - assert np.allclose(np.array(out.pages[0].predictions[CLASS_NAME][4].geometry), geometry_revised, rtol=0.05) + assert np.allclose(np.array(out.pages[0].predictions[CLASS_NAME][6].geometry), geometry_revised, rtol=0.05) det_predictor = detection_predictor( "db_resnet50", From c02a4779608e1585f98538b1bc33f951c624fab9 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Wed, 14 Feb 2024 09:47:11 +0100 Subject: [PATCH 09/39] fix --- references/detection/train_tensorflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/references/detection/train_tensorflow.py b/references/detection/train_tensorflow.py index bed759187..0b61c4c29 100644 --- a/references/detection/train_tensorflow.py +++ b/references/detection/train_tensorflow.py @@ -132,7 +132,7 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False): if current_progress - last_progress > interval_progress: send_on_slack(str(pbar)) last_progress = int(current_progress) - send_on_slack(f"Final training loss: {train_loss.item():.6}") + send_on_slack(f"Final training loss: {train_loss.numpy():.6}") def evaluate(model, val_loader, batch_transforms, val_metric): From 2602a5da533ea8be1d4af9f7c61a57cbf341dc77 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Mon, 19 Feb 2024 10:18:52 +0100 Subject: [PATCH 10/39] Display pbar before starting training --- references/detection/train_pytorch.py | 1 + references/detection/train_tensorflow.py | 1 + 2 files changed, 2 insertions(+) diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py index d23111f18..fccc092f1 100644 --- a/references/detection/train_pytorch.py +++ b/references/detection/train_pytorch.py @@ -136,6 +136,7 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a last_progress = 0 interval_progress = 5 pbar = tqdm(train_loader, position=1) + send_on_slack(str(pbar)) for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() diff --git a/references/detection/train_tensorflow.py b/references/detection/train_tensorflow.py index 0b61c4c29..62788331f 100644 --- a/references/detection/train_tensorflow.py +++ b/references/detection/train_tensorflow.py @@ -117,6 +117,7 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False): last_progress = 0 interval_progress = 5 pbar = tqdm(train_iter, position=1) + send_on_slack(str(pbar)) for images, targets in pbar: images = batch_transforms(images) From 578d9b82a34b13c724645c3448ea123ffdd58240 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Wed, 6 Mar 2024 18:01:52 +0100 Subject: [PATCH 11/39] temp eval with cord funsd from felix --- references/detection/train_pytorch.py | 117 +++++++++++++++++++++++++- 1 file changed, 116 insertions(+), 1 deletion(-) diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py index fccc092f1..dc5263c1a 100644 --- a/references/detection/train_pytorch.py +++ b/references/detection/train_pytorch.py @@ -4,6 +4,10 @@ # See LICENSE or go to for full license details. import os +from pathlib import Path +from doctr.file_utils import CLASS_NAME +from doctr import datasets + os.environ["USE_TORCH"] = "1" @@ -201,6 +205,38 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): recall, precision, mean_iou = val_metric.summary() return val_loss, recall, precision, mean_iou +@torch.no_grad() +def sec_evaluate(model, val_loader, batch_transforms, val_metric, amp=False): + # Model in eval mode + model.eval() + # Reset val metric + val_metric.reset() + # Validation loop + val_loss, batch_cnt = 0, 0 + for images, targets in tqdm(val_loader): + if torch.cuda.is_available(): + images = images.cuda() + images = batch_transforms(images) + targets = [{CLASS_NAME: t["boxes"]} for t in targets] + if amp: + with torch.cuda.amp.autocast(): + out = model(images, targets, return_preds=True) + else: + out = model(images, targets, return_preds=True) + # Compute metric + loc_preds = out["preds"] + for target, loc_pred in zip(targets, loc_preds): + for boxes_gt, boxes_pred in zip(target.values(), loc_pred.values()): + # Remove scores + val_metric.update(gts=boxes_gt, preds=boxes_pred[:, :-1]) + + val_loss += out["loss"].item() + batch_cnt += 1 + + val_loss /= batch_cnt + recall, precision, mean_iou = val_metric.summary() + return val_loss, recall, precision, mean_iou + def main(args): print(args) @@ -255,6 +291,67 @@ def main(args): batch_transforms = Normalize(mean=(0.798, 0.785, 0.772), std=(0.264, 0.2749, 0.287)) + funsd_ds = datasets.FUNSD( + train=True, + download=True, + use_polygons=args.rotation, + sample_transforms=T.Resize((args.input_size, args.input_size)), + ) + # Monkeypatch + subfolder = funsd_ds.root.split("/")[-2:] + funsd_ds.root = str(Path(funsd_ds.root).parent.parent) + funsd_ds.data = [(os.path.join(*subfolder, name), target) for name, target in funsd_ds.data] + _funsd_ds = datasets.FUNSD( + train=False, + download=True, + use_polygons=args.rotation, + sample_transforms=T.Resize((args.input_size, args.input_size)), + ) + subfolder = _funsd_ds.root.split("/")[-2:] + funsd_ds.data.extend([(os.path.join(*subfolder, name), target) for name, target in _funsd_ds.data]) + + funsd_test_loader = DataLoader( + funsd_ds, + batch_size=args.batch_size, + drop_last=False, + num_workers=args.workers, + sampler=SequentialSampler(funsd_ds), + pin_memory=torch.cuda.is_available(), + collate_fn=funsd_ds.collate_fn, + ) + print(f"FUNSD Test set loaded in {time.time() - st:.4}s ({len(funsd_ds)} samples in " f"{len(funsd_test_loader)} batches)") + + + cord_ds = datasets.CORD( + train=True, + download=True, + use_polygons=args.rotation, + sample_transforms=T.Resize((args.input_size, args.input_size)), + ) + # Monkeypatch + subfolder = cord_ds.root.split("/")[-2:] + cord_ds.root = str(Path(cord_ds.root).parent.parent) + cord_ds.data = [(os.path.join(*subfolder, name), target) for name, target in cord_ds.data] + _cord_ds = datasets.CORD( + train=False, + download=True, + use_polygons=args.rotation, + sample_transforms=T.Resize((args.input_size, args.input_size)), + ) + subfolder = _cord_ds.root.split("/")[-2:] + cord_ds.data.extend([(os.path.join(*subfolder, name), target) for name, target in _cord_ds.data]) + + cord_test_loader = DataLoader( + cord_ds, + batch_size=args.batch_size, + drop_last=False, + num_workers=args.workers, + sampler=SequentialSampler(cord_ds), + pin_memory=torch.cuda.is_available(), + collate_fn=cord_ds.collate_fn, + ) + print(f"CORD Test set loaded in {time.time() - st:.4}s ({len(cord_ds)} samples in " f"{len(funsd_test_loader)} batches)") + # Load doctr model model = detection.__dict__[args.arch]( pretrained=args.pretrained, @@ -290,6 +387,16 @@ def main(args): mask_shape=(args.input_size, args.input_size), use_broadcasting=True if system_available_memory > 62 else False, ) + funsd_val_metric = LocalizationConfusion( + use_polygons=args.rotation and not args.eval_straight, + mask_shape=(args.input_size, args.input_size), + use_broadcasting=True if system_available_memory > 62 else False, + ) + cord_val_metric = LocalizationConfusion( + use_polygons=args.rotation and not args.eval_straight, + mask_shape=(args.input_size, args.input_size), + use_broadcasting=True if system_available_memory > 62 else False, + ) if args.test_only: print("Running evaluation") @@ -419,6 +526,12 @@ def main(args): fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp) # Validation loop at the end of each epoch val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric, amp=args.amp) + _, funsd_recall, funsd_precision, funsd_mean_iou = sec_evaluate( + model, funsd_test_loader, batch_transforms, funsd_val_metric, amp=args.amp + ) + _, cord_recall, cord_precision, cord_mean_iou = sec_evaluate( + model, cord_test_loader, batch_transforms, cord_val_metric, amp=args.amp + ) if val_loss < min_loss: print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") send_on_slack(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") @@ -431,7 +544,9 @@ def main(args): if any(val is None for val in (recall, precision, mean_iou)): log_msg += "(Undefined metric value, caused by empty GTs or predictions)" else: - log_msg += f"(Recall: {recall:.2%} | Precision: {precision:.2%} | Mean IoU: {mean_iou:.2%})" + log_msg += f"(Recall: {recall:.2%} | Precision: {precision:.2%} | Mean IoU: {mean_iou:.2%})\n" + log_msg += f"FUNSD: Recall: {funsd_recall:.2%} | Precision: {funsd_precision:.2%} | Mean IoU: {funsd_mean_iou:.2%}\n" + log_msg += f"CORD: Recall: {cord_recall:.2%} | Precision: {cord_precision:.2%} | Mean IoU: {cord_mean_iou:.2%}" print(log_msg) send_on_slack(log_msg) # W&B From ccf19d9aa62bb47b913a9108212fba784147463c Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Fri, 8 Mar 2024 14:15:49 +0100 Subject: [PATCH 12/39] try_except on sec_evaluate --- references/detection/train_pytorch.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py index dc5263c1a..d4c2510db 100644 --- a/references/detection/train_pytorch.py +++ b/references/detection/train_pytorch.py @@ -526,12 +526,20 @@ def main(args): fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp) # Validation loop at the end of each epoch val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric, amp=args.amp) - _, funsd_recall, funsd_precision, funsd_mean_iou = sec_evaluate( - model, funsd_test_loader, batch_transforms, funsd_val_metric, amp=args.amp - ) - _, cord_recall, cord_precision, cord_mean_iou = sec_evaluate( - model, cord_test_loader, batch_transforms, cord_val_metric, amp=args.amp - ) + funsd_recall, funsd_precision, funsd_mean_iou = 0.0, 0.0, 0.0 + cord_recall, cord_precision, cord_mean_iou = 0.0, 0.0, 0.0 + try: + _, funsd_recall, funsd_precision, funsd_mean_iou = sec_evaluate( + model, funsd_test_loader, batch_transforms, funsd_val_metric, amp=args.amp + ) + except Exception: + pass + try: + _, cord_recall, cord_precision, cord_mean_iou = sec_evaluate( + model, cord_test_loader, batch_transforms, cord_val_metric, amp=args.amp + ) + except Exception: + pass if val_loss < min_loss: print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") send_on_slack(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") From a5a6101a8ba76c2a6bc346a4954d031551ce3ef3 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Fri, 8 Mar 2024 14:19:44 +0100 Subject: [PATCH 13/39] pbar on evaluate fn --- references/detection/train_pytorch.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py index d4c2510db..d33e932f6 100644 --- a/references/detection/train_pytorch.py +++ b/references/detection/train_pytorch.py @@ -178,9 +178,13 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): model.eval() # Reset val metric val_metric.reset() + last_progress = 0 + interval_progress = 5 + pbar = tqdm(val_loader) + send_on_slack(str(pbar)) # Validation loop val_loss, batch_cnt = 0, 0 - for images, targets in tqdm(val_loader): + for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() images = batch_transforms(images) @@ -198,6 +202,10 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): boxes_pred = np.concatenate((boxes_pred.min(axis=1), boxes_pred.max(axis=1)), axis=-1) val_metric.update(gts=boxes_gt, preds=boxes_pred[:, :4]) + current_progress = pbar.n / pbar.total * 100 + if current_progress - last_progress > interval_progress: + send_on_slack(str(pbar)) + last_progress = int(current_progress) val_loss += out["loss"].item() batch_cnt += 1 From cc795f604854ef66d05e60bacf01684ac30cce97 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Fri, 8 Mar 2024 14:22:02 +0100 Subject: [PATCH 14/39] pbar on sec_evaluate --- references/detection/train_pytorch.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py index d33e932f6..2add7873c 100644 --- a/references/detection/train_pytorch.py +++ b/references/detection/train_pytorch.py @@ -219,9 +219,13 @@ def sec_evaluate(model, val_loader, batch_transforms, val_metric, amp=False): model.eval() # Reset val metric val_metric.reset() + last_progress = 0 + interval_progress = 5 + pbar = tqdm(val_loader) + send_on_slack(str(pbar)) # Validation loop val_loss, batch_cnt = 0, 0 - for images, targets in tqdm(val_loader): + for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() images = batch_transforms(images) @@ -238,6 +242,10 @@ def sec_evaluate(model, val_loader, batch_transforms, val_metric, amp=False): # Remove scores val_metric.update(gts=boxes_gt, preds=boxes_pred[:, :-1]) + current_progress = pbar.n / pbar.total * 100 + if current_progress - last_progress > interval_progress: + send_on_slack(str(pbar)) + last_progress = int(current_progress) val_loss += out["loss"].item() batch_cnt += 1 From 34e32ebea31b6626dbe61e576b8b22fe157554df Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Mon, 11 Mar 2024 19:04:10 +0100 Subject: [PATCH 15/39] apply patch from https://github.com/felixdittrich92/doctr/commit/27bc838a44784f1a6868693d55b12b3c5216d81c --- references/detection/train_pytorch.py | 120 ++++++++++---------------- 1 file changed, 44 insertions(+), 76 deletions(-) diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py index 2add7873c..d43f610ab 100644 --- a/references/detection/train_pytorch.py +++ b/references/detection/train_pytorch.py @@ -213,46 +213,6 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): recall, precision, mean_iou = val_metric.summary() return val_loss, recall, precision, mean_iou -@torch.no_grad() -def sec_evaluate(model, val_loader, batch_transforms, val_metric, amp=False): - # Model in eval mode - model.eval() - # Reset val metric - val_metric.reset() - last_progress = 0 - interval_progress = 5 - pbar = tqdm(val_loader) - send_on_slack(str(pbar)) - # Validation loop - val_loss, batch_cnt = 0, 0 - for images, targets in pbar: - if torch.cuda.is_available(): - images = images.cuda() - images = batch_transforms(images) - targets = [{CLASS_NAME: t["boxes"]} for t in targets] - if amp: - with torch.cuda.amp.autocast(): - out = model(images, targets, return_preds=True) - else: - out = model(images, targets, return_preds=True) - # Compute metric - loc_preds = out["preds"] - for target, loc_pred in zip(targets, loc_preds): - for boxes_gt, boxes_pred in zip(target.values(), loc_pred.values()): - # Remove scores - val_metric.update(gts=boxes_gt, preds=boxes_pred[:, :-1]) - - current_progress = pbar.n / pbar.total * 100 - if current_progress - last_progress > interval_progress: - send_on_slack(str(pbar)) - last_progress = int(current_progress) - val_loss += out["loss"].item() - batch_cnt += 1 - - val_loss /= batch_cnt - recall, precision, mean_iou = val_metric.summary() - return val_loss, recall, precision, mean_iou - def main(args): print(args) @@ -307,24 +267,27 @@ def main(args): batch_transforms = Normalize(mean=(0.798, 0.785, 0.772), std=(0.264, 0.2749, 0.287)) - funsd_ds = datasets.FUNSD( - train=True, - download=True, - use_polygons=args.rotation, - sample_transforms=T.Resize((args.input_size, args.input_size)), - ) - # Monkeypatch - subfolder = funsd_ds.root.split("/")[-2:] - funsd_ds.root = str(Path(funsd_ds.root).parent.parent) - funsd_ds.data = [(os.path.join(*subfolder, name), target) for name, target in funsd_ds.data] - _funsd_ds = datasets.FUNSD( - train=False, - download=True, - use_polygons=args.rotation, - sample_transforms=T.Resize((args.input_size, args.input_size)), + funsd_ds = DetectionDataset( + img_folder=os.path.join(args.funsd_path, "images"), + label_path=os.path.join(args.funsd_path, "labels.json"), + sample_transforms=T.SampleCompose( + ( + [T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True)] + if not args.rotation or args.eval_straight + else [] + ) + + ( + [ + T.Resize(args.input_size, preserve_aspect_ratio=True), # This does not pad + T.RandomApply(T.RandomRotate(90, expand=True), 0.5), + T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True), + ] + if args.rotation and not args.eval_straight + else [] + ) + ), + use_polygons=args.rotation and not args.eval_straight, ) - subfolder = _funsd_ds.root.split("/")[-2:] - funsd_ds.data.extend([(os.path.join(*subfolder, name), target) for name, target in _funsd_ds.data]) funsd_test_loader = DataLoader( funsd_ds, @@ -338,24 +301,27 @@ def main(args): print(f"FUNSD Test set loaded in {time.time() - st:.4}s ({len(funsd_ds)} samples in " f"{len(funsd_test_loader)} batches)") - cord_ds = datasets.CORD( - train=True, - download=True, - use_polygons=args.rotation, - sample_transforms=T.Resize((args.input_size, args.input_size)), - ) - # Monkeypatch - subfolder = cord_ds.root.split("/")[-2:] - cord_ds.root = str(Path(cord_ds.root).parent.parent) - cord_ds.data = [(os.path.join(*subfolder, name), target) for name, target in cord_ds.data] - _cord_ds = datasets.CORD( - train=False, - download=True, - use_polygons=args.rotation, - sample_transforms=T.Resize((args.input_size, args.input_size)), + cord_ds = DetectionDataset( + img_folder=os.path.join(args.cord_path, "images"), + label_path=os.path.join(args.cord_path, "labels.json"), + sample_transforms=T.SampleCompose( + ( + [T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True)] + if not args.rotation or args.eval_straight + else [] + ) + + ( + [ + T.Resize(args.input_size, preserve_aspect_ratio=True), # This does not pad + T.RandomApply(T.RandomRotate(90, expand=True), 0.5), + T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True), + ] + if args.rotation and not args.eval_straight + else [] + ) + ), + use_polygons=args.rotation and not args.eval_straight, ) - subfolder = _cord_ds.root.split("/")[-2:] - cord_ds.data.extend([(os.path.join(*subfolder, name), target) for name, target in _cord_ds.data]) cord_test_loader = DataLoader( cord_ds, @@ -545,13 +511,13 @@ def main(args): funsd_recall, funsd_precision, funsd_mean_iou = 0.0, 0.0, 0.0 cord_recall, cord_precision, cord_mean_iou = 0.0, 0.0, 0.0 try: - _, funsd_recall, funsd_precision, funsd_mean_iou = sec_evaluate( + _, funsd_recall, funsd_precision, funsd_mean_iou = evaluate( model, funsd_test_loader, batch_transforms, funsd_val_metric, amp=args.amp ) except Exception: pass try: - _, cord_recall, cord_precision, cord_mean_iou = sec_evaluate( + _, cord_recall, cord_precision, cord_mean_iou = evaluate( model, cord_test_loader, batch_transforms, cord_val_metric, amp=args.amp ) except Exception: @@ -603,6 +569,8 @@ def parse_args(): parser.add_argument("train_path", type=str, help="path to training data folder") parser.add_argument("val_path", type=str, help="path to validation data folder") + parser.add_argument("funsd_path", type=str, help="path to FUNSD data folder") + parser.add_argument("cord_path", type=str, help="path to Cord data folder") parser.add_argument("arch", type=str, help="text-detection model to train") parser.add_argument("--name", type=str, default=None, help="Name of your training experiment") parser.add_argument("--epochs", type=int, default=10, help="number of epochs to train the model on") From 2e90794343b7f9144a36908bb3b780db9795e080 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Wed, 20 Mar 2024 11:59:09 +0100 Subject: [PATCH 16/39] stop using custom ds for val --- references/detection/train_pytorch.py | 178 +++++++++++++------------- 1 file changed, 88 insertions(+), 90 deletions(-) diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py index d43f610ab..4cde7df4b 100644 --- a/references/detection/train_pytorch.py +++ b/references/detection/train_pytorch.py @@ -267,72 +267,72 @@ def main(args): batch_transforms = Normalize(mean=(0.798, 0.785, 0.772), std=(0.264, 0.2749, 0.287)) - funsd_ds = DetectionDataset( - img_folder=os.path.join(args.funsd_path, "images"), - label_path=os.path.join(args.funsd_path, "labels.json"), - sample_transforms=T.SampleCompose( - ( - [T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True)] - if not args.rotation or args.eval_straight - else [] - ) - + ( - [ - T.Resize(args.input_size, preserve_aspect_ratio=True), # This does not pad - T.RandomApply(T.RandomRotate(90, expand=True), 0.5), - T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True), - ] - if args.rotation and not args.eval_straight - else [] - ) - ), - use_polygons=args.rotation and not args.eval_straight, - ) - - funsd_test_loader = DataLoader( - funsd_ds, - batch_size=args.batch_size, - drop_last=False, - num_workers=args.workers, - sampler=SequentialSampler(funsd_ds), - pin_memory=torch.cuda.is_available(), - collate_fn=funsd_ds.collate_fn, - ) - print(f"FUNSD Test set loaded in {time.time() - st:.4}s ({len(funsd_ds)} samples in " f"{len(funsd_test_loader)} batches)") - - - cord_ds = DetectionDataset( - img_folder=os.path.join(args.cord_path, "images"), - label_path=os.path.join(args.cord_path, "labels.json"), - sample_transforms=T.SampleCompose( - ( - [T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True)] - if not args.rotation or args.eval_straight - else [] - ) - + ( - [ - T.Resize(args.input_size, preserve_aspect_ratio=True), # This does not pad - T.RandomApply(T.RandomRotate(90, expand=True), 0.5), - T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True), - ] - if args.rotation and not args.eval_straight - else [] - ) - ), - use_polygons=args.rotation and not args.eval_straight, - ) - - cord_test_loader = DataLoader( - cord_ds, - batch_size=args.batch_size, - drop_last=False, - num_workers=args.workers, - sampler=SequentialSampler(cord_ds), - pin_memory=torch.cuda.is_available(), - collate_fn=cord_ds.collate_fn, - ) - print(f"CORD Test set loaded in {time.time() - st:.4}s ({len(cord_ds)} samples in " f"{len(funsd_test_loader)} batches)") + #funsd_ds = DetectionDataset( + # img_folder=os.path.join(args.funsd_path, "images"), + # label_path=os.path.join(args.funsd_path, "labels.json"), + # sample_transforms=T.SampleCompose( + # ( + # [T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True)] + # if not args.rotation or args.eval_straight + # else [] + # ) + # + ( + # [ + # T.Resize(args.input_size, preserve_aspect_ratio=True), # This does not pad + # T.RandomApply(T.RandomRotate(90, expand=True), 0.5), + # T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True), + # ] + # if args.rotation and not args.eval_straight + # else [] + # ) + # ), + # use_polygons=args.rotation and not args.eval_straight, + #) + + #funsd_test_loader = DataLoader( + # funsd_ds, + # batch_size=args.batch_size, + # drop_last=False, + # num_workers=args.workers, + # sampler=SequentialSampler(funsd_ds), + # pin_memory=torch.cuda.is_available(), + # collate_fn=funsd_ds.collate_fn, + #) + #print(f"FUNSD Test set loaded in {time.time() - st:.4}s ({len(funsd_ds)} samples in " f"{len(funsd_test_loader)} batches)") + + + #cord_ds = DetectionDataset( + # img_folder=os.path.join(args.cord_path, "images"), + # label_path=os.path.join(args.cord_path, "labels.json"), + # sample_transforms=T.SampleCompose( + # ( + # [T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True)] + # if not args.rotation or args.eval_straight + # else [] + # ) + # + ( + # [ + # T.Resize(args.input_size, preserve_aspect_ratio=True), # This does not pad + # T.RandomApply(T.RandomRotate(90, expand=True), 0.5), + # T.Resize((args.input_size, args.input_size), preserve_aspect_ratio=True, symmetric_pad=True), + # ] + # if args.rotation and not args.eval_straight + # else [] + # ) + # ), + # use_polygons=args.rotation and not args.eval_straight, + #) + + #cord_test_loader = DataLoader( + # cord_ds, + # batch_size=args.batch_size, + # drop_last=False, + # num_workers=args.workers, + # sampler=SequentialSampler(cord_ds), + # pin_memory=torch.cuda.is_available(), + # collate_fn=cord_ds.collate_fn, + #) + #print(f"CORD Test set loaded in {time.time() - st:.4}s ({len(cord_ds)} samples in " f"{len(funsd_test_loader)} batches)") # Load doctr model model = detection.__dict__[args.arch]( @@ -369,16 +369,16 @@ def main(args): mask_shape=(args.input_size, args.input_size), use_broadcasting=True if system_available_memory > 62 else False, ) - funsd_val_metric = LocalizationConfusion( - use_polygons=args.rotation and not args.eval_straight, - mask_shape=(args.input_size, args.input_size), - use_broadcasting=True if system_available_memory > 62 else False, - ) - cord_val_metric = LocalizationConfusion( - use_polygons=args.rotation and not args.eval_straight, - mask_shape=(args.input_size, args.input_size), - use_broadcasting=True if system_available_memory > 62 else False, - ) + #funsd_val_metric = LocalizationConfusion( + # use_polygons=args.rotation and not args.eval_straight, + # mask_shape=(args.input_size, args.input_size), + # use_broadcasting=True if system_available_memory > 62 else False, + #) + #cord_val_metric = LocalizationConfusion( + # use_polygons=args.rotation and not args.eval_straight, + # mask_shape=(args.input_size, args.input_size), + # use_broadcasting=True if system_available_memory > 62 else False, + #) if args.test_only: print("Running evaluation") @@ -510,18 +510,18 @@ def main(args): val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric, amp=args.amp) funsd_recall, funsd_precision, funsd_mean_iou = 0.0, 0.0, 0.0 cord_recall, cord_precision, cord_mean_iou = 0.0, 0.0, 0.0 - try: - _, funsd_recall, funsd_precision, funsd_mean_iou = evaluate( - model, funsd_test_loader, batch_transforms, funsd_val_metric, amp=args.amp - ) - except Exception: - pass - try: - _, cord_recall, cord_precision, cord_mean_iou = evaluate( - model, cord_test_loader, batch_transforms, cord_val_metric, amp=args.amp - ) - except Exception: - pass + #try: + # _, funsd_recall, funsd_precision, funsd_mean_iou = evaluate( + # model, funsd_test_loader, batch_transforms, funsd_val_metric, amp=args.amp + # ) + #except Exception: + # pass + #try: + # _, cord_recall, cord_precision, cord_mean_iou = evaluate( + # model, cord_test_loader, batch_transforms, cord_val_metric, amp=args.amp + # ) + #except Exception: + # pass if val_loss < min_loss: print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") send_on_slack(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") @@ -569,8 +569,6 @@ def parse_args(): parser.add_argument("train_path", type=str, help="path to training data folder") parser.add_argument("val_path", type=str, help="path to validation data folder") - parser.add_argument("funsd_path", type=str, help="path to FUNSD data folder") - parser.add_argument("cord_path", type=str, help="path to Cord data folder") parser.add_argument("arch", type=str, help="text-detection model to train") parser.add_argument("--name", type=str, default=None, help="Name of your training experiment") parser.add_argument("--epochs", type=int, default=10, help="number of epochs to train the model on") From e6d393b015906f7e1d1ad5a6d86803dd74d7f86a Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Mon, 15 Apr 2024 23:09:36 +0200 Subject: [PATCH 17/39] train_pytorch_orientation send_on_slack --- .../train_pytorch_orientation.py | 57 ++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/references/classification/train_pytorch_orientation.py b/references/classification/train_pytorch_orientation.py index 688e48564..82c2bd46a 100644 --- a/references/classification/train_pytorch_orientation.py +++ b/references/classification/train_pytorch_orientation.py @@ -11,6 +11,7 @@ import logging import multiprocessing as mp import time +from pathlib import Path import numpy as np import torch @@ -35,6 +36,33 @@ from doctr.models.utils import export_model_to_onnx from utils import EarlyStopper, plot_recorder, plot_samples +SLACK_WEBHOOK_URL = None +SLACK_WEBHOOK_PATH = Path(os.path.join(os.path.expanduser("~"), ".config", "doctr", "slack_webhook_url.txt")) +if SLACK_WEBHOOK_PATH.exists(): + with open(SLACK_WEBHOOK_PATH) as f: + SLACK_WEBHOOK_URL = f.read().strip() +else: + print(f"{SLACK_WEBHOOK_PATH} does not exist, skip Slack integration configuration...") + + +def send_on_slack(text: str): + """Send a message on Slack. + + Args: + text (str): message to send on Slack + """ + if SLACK_WEBHOOK_URL: + try: + import requests + + requests.post( + url=SLACK_WEBHOOK_URL, + json={"text": f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}"}, + ) + except Exception: + print("Impossible to send message on Slack, continue...") + + CLASSES = [0, 90, 180, 270] @@ -121,7 +149,10 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a model.train() # Iterate over the batches of the dataset + last_progress = 0 + interval_progress = 5 pbar = tqdm(train_loader, position=1) + send_on_slack(str(pbar)) for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() @@ -146,15 +177,24 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a scheduler.step() pbar.set_description(f"Training loss: {train_loss.item():.6}") + current_progress = pbar.n / pbar.total * 100 + if current_progress - last_progress > interval_progress: + send_on_slack(str(pbar)) + last_progress = int(current_progress) + send_on_slack(f"Final training loss: {train_loss.item():.6}") @torch.no_grad() def evaluate(model, val_loader, batch_transforms, amp=False): # Model in eval mode model.eval() + last_progress = 0 + interval_progress = 5 + pbar = tqdm(val_loader) + send_on_slack(str(pbar)) # Validation loop val_loss, correct, samples, batch_cnt = 0.0, 0.0, 0.0, 0.0 - for images, targets in tqdm(val_loader): + for images, targets in pbar: images = batch_transforms(images) if torch.cuda.is_available(): @@ -175,6 +215,11 @@ def evaluate(model, val_loader, batch_transforms, amp=False): batch_cnt += 1 samples += images.shape[0] + current_progress = pbar.n / pbar.total * 100 + if current_progress - last_progress > interval_progress: + send_on_slack(str(pbar)) + last_progress = int(current_progress) + val_loss /= batch_cnt acc = correct / samples return val_loss, acc @@ -214,6 +259,9 @@ def main(args): pin_memory=torch.cuda.is_available(), ) print(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in " f"{len(val_loader)} batches)") + send_on_slack( + f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in " f"{len(val_loader)} batches)" + ) batch_transforms = Normalize(mean=(0.694, 0.695, 0.693), std=(0.299, 0.296, 0.301)) @@ -223,6 +271,7 @@ def main(args): # Resume weights if isinstance(args.resume, str): print(f"Resuming {args.resume}") + send_on_slack(f"Resuming {args.resume}") checkpoint = torch.load(args.resume, map_location="cpu") model.load_state_dict(checkpoint) @@ -276,6 +325,9 @@ def main(args): pin_memory=torch.cuda.is_available(), ) print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in " f"{len(train_loader)} batches)") + send_on_slack( + f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in " f"{len(train_loader)} batches)" + ) if args.show_samples: x, target = next(iter(train_loader)) @@ -338,9 +390,11 @@ def main(args): val_loss, acc = evaluate(model, val_loader, batch_transforms) if val_loss < min_loss: print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + send_on_slack(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") torch.save(model.state_dict(), f"./{exp_name}.pt") min_loss = val_loss print(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + send_on_slack(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") # W&B if args.wb: wandb.log({ @@ -349,6 +403,7 @@ def main(args): }) if args.early_stop and early_stopper.early_stop(val_loss): print("Training halted early due to reaching patience limit.") + send_on_slack("Training halted early due to reaching patience limit.") break if args.wb: run.finish() From 05a677970e0e1c4bdc96410bac070c8cd578a1cc Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Mon, 15 Apr 2024 23:13:45 +0200 Subject: [PATCH 18/39] feat: :sparkles: orientation dataset walk --- doctr/datasets/orientation.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doctr/datasets/orientation.py b/doctr/datasets/orientation.py index 10bd55444..11ebd806f 100644 --- a/doctr/datasets/orientation.py +++ b/doctr/datasets/orientation.py @@ -3,7 +3,7 @@ # This program is licensed under the Apache License 2.0. # See LICENSE or go to for full license details. -import os +from pathlib import Path from typing import Any, List, Tuple import numpy as np @@ -37,4 +37,6 @@ def __init__( ) # initialize dataset with 0 degree rotation targets - self.data: List[Tuple[str, np.ndarray]] = [(img_name, np.array([0])) for img_name in os.listdir(self.root)] + self.data: List[Tuple[str, np.ndarray]] = [ + (img_name, np.array([0])) for img_name in Path(self.root).rglob("*.jpg") + ] From bd18864bac054d11329809b12494d4a247bc90c3 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Tue, 16 Apr 2024 19:01:19 +0200 Subject: [PATCH 19/39] (32, 32) -> (128, 128) --- references/classification/train_pytorch_orientation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/references/classification/train_pytorch_orientation.py b/references/classification/train_pytorch_orientation.py index 82c2bd46a..7f57a3dff 100644 --- a/references/classification/train_pytorch_orientation.py +++ b/references/classification/train_pytorch_orientation.py @@ -236,7 +236,7 @@ def main(args): torch.backends.cudnn.benchmark = True - input_size = (256, 256) if args.type == "page" else (32, 32) + input_size = (256, 256) if args.type == "page" else (128, 128) # Load val data generator st = time.time() From ea587106ad8dc6ead0314ecd458d22228e6b05ac Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Tue, 16 Apr 2024 19:02:37 +0200 Subject: [PATCH 20/39] (256, 256) -> (512, 512) --- references/classification/train_pytorch_orientation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/references/classification/train_pytorch_orientation.py b/references/classification/train_pytorch_orientation.py index 7f57a3dff..2dbc5fc59 100644 --- a/references/classification/train_pytorch_orientation.py +++ b/references/classification/train_pytorch_orientation.py @@ -236,7 +236,7 @@ def main(args): torch.backends.cudnn.benchmark = True - input_size = (256, 256) if args.type == "page" else (128, 128) + input_size = (512, 512) if args.type == "page" else (128, 128) # Load val data generator st = time.time() From 1049fab08bd308e9ff68b015a16eb1de07e75634 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Tue, 16 Apr 2024 19:04:29 +0200 Subject: [PATCH 21/39] train_tensorflow_orientation.py: size for crop --- references/classification/train_tensorflow_orientation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/references/classification/train_tensorflow_orientation.py b/references/classification/train_tensorflow_orientation.py index ed0479172..c01ce8371 100644 --- a/references/classification/train_tensorflow_orientation.py +++ b/references/classification/train_tensorflow_orientation.py @@ -147,7 +147,7 @@ def main(args): if not isinstance(args.workers, int): args.workers = min(16, mp.cpu_count()) - input_size = (256, 256) if args.type == "page" else (32, 32) + input_size = (512, 512) if args.type == "page" else (128, 128) # AMP if args.amp: From 53aa99bdf42beb56ba115a7ab3bdbf35f8f52df8 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Mon, 22 Apr 2024 16:20:37 +0200 Subject: [PATCH 22/39] slack display args on train_pytorch_orientation --- references/classification/train_pytorch_orientation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/references/classification/train_pytorch_orientation.py b/references/classification/train_pytorch_orientation.py index 5b51fdc69..feca568e5 100644 --- a/references/classification/train_pytorch_orientation.py +++ b/references/classification/train_pytorch_orientation.py @@ -227,6 +227,7 @@ def evaluate(model, val_loader, batch_transforms, amp=False): def main(args): print(args) + send_on_slack(f"Start training: {args}") if args.push_to_hub: login_to_hub() From 4d751f1f20d99cefbd675651bb33125632871d57 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Tue, 30 Apr 2024 16:15:57 +0200 Subject: [PATCH 23/39] pbar `train_tensorflow_orientation` --- .../train_tensorflow_orientation.py | 58 ++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/references/classification/train_tensorflow_orientation.py b/references/classification/train_tensorflow_orientation.py index a37ce8e1d..e7116595f 100644 --- a/references/classification/train_tensorflow_orientation.py +++ b/references/classification/train_tensorflow_orientation.py @@ -11,6 +11,7 @@ import datetime import multiprocessing as mp import time +from pathlib import Path import numpy as np import tensorflow as tf @@ -30,6 +31,32 @@ from doctr.transforms.functional import rotated_img_tensor from utils import EarlyStopper, plot_recorder, plot_samples +SLACK_WEBHOOK_URL = None +SLACK_WEBHOOK_PATH = Path(os.path.join(os.path.expanduser("~"), ".config", "doctr", "slack_webhook_url.txt")) +if SLACK_WEBHOOK_PATH.exists(): + with open(SLACK_WEBHOOK_PATH) as f: + SLACK_WEBHOOK_URL = f.read().strip() +else: + print(f"{SLACK_WEBHOOK_PATH} does not exist, skip Slack integration configuration...") + + +def send_on_slack(text: str): + """Send a message on Slack. + + Args: + text (str): message to send on Slack + """ + if SLACK_WEBHOOK_URL: + try: + import requests + + requests.post( + url=SLACK_WEBHOOK_URL, + json={"text": f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}"}, + ) + except Exception: + print("Impossible to send message on Slack, continue...") + CLASSES = [0, -90, 180, 90] @@ -99,7 +126,10 @@ def record_lr( def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False): # Iterate over the batches of the dataset + last_progress = 0 + interval_progress = 5 pbar = tqdm(train_loader, position=1) + send_on_slack(str(pbar)) for images, targets in pbar: images = batch_transforms(images) @@ -112,13 +142,22 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False): optimizer.apply_gradients(zip(grads, model.trainable_weights)) pbar.set_description(f"Training loss: {train_loss.numpy().mean():.6}") + current_progress = pbar.n / pbar.total * 100 + if current_progress - last_progress > interval_progress: + send_on_slack(str(pbar)) + last_progress = int(current_progress) + send_on_slack(f"Final training loss: {train_loss.item():.6}") def evaluate(model, val_loader, batch_transforms): # Validation loop + last_progress = 0 + interval_progress = 5 val_loss, correct, samples, batch_cnt = 0.0, 0.0, 0.0, 0.0 val_iter = iter(val_loader) - for images, targets in tqdm(val_iter): + pbar = tqdm(val_iter) + send_on_slack(str(pbar)) + for images, targets in pbar: images = batch_transforms(images) out = model(images, training=False) loss = tf.nn.sparse_softmax_cross_entropy_with_logits(targets, out) @@ -129,6 +168,11 @@ def evaluate(model, val_loader, batch_transforms): batch_cnt += 1 samples += images.shape[0] + current_progress = pbar.n / pbar.total * 100 + if current_progress - last_progress > interval_progress: + send_on_slack(str(pbar)) + last_progress = int(current_progress) + val_loss /= batch_cnt acc = correct / samples return val_loss, acc @@ -143,6 +187,7 @@ def collate_fn(samples): def main(args): print(args) + send_on_slack(f"Start training: {args}") if args.push_to_hub: login_to_hub() @@ -180,6 +225,10 @@ def main(args): f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in " f"{val_loader.num_batches} batches)" ) + send_on_slack( + f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in " + f"{val_loader.num_batches} batches)" + ) # Load doctr model model = classification.__dict__[args.arch]( @@ -236,6 +285,10 @@ def main(args): f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in " f"{train_loader.num_batches} batches)" ) + send_on_slack( + f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in " + f"{train_loader.num_batches} batches)" + ) if args.show_samples: x, target = next(iter(train_loader)) @@ -307,9 +360,11 @@ def main(args): val_loss, acc = evaluate(model, val_loader, batch_transforms) if val_loss < min_loss: print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + send_on_slack(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") model.save_weights(f"./{exp_name}/weights") min_loss = val_loss print(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + send_on_slack(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") # W&B if args.wb: wandb.log({ @@ -326,6 +381,7 @@ def main(args): logger.report_scalar(title="Accuracy", series="acc", value=acc, iteration=epoch) if args.early_stop and early_stopper.early_stop(val_loss): print("Training halted early due to reaching patience limit.") + send_on_slack("Training halted early due to reaching patience limit.") break if args.wb: run.finish() From 9f034cb83f5b90d09815e12eb2cb2da4957ee2c1 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Wed, 22 May 2024 11:00:18 +0200 Subject: [PATCH 24/39] Frankenstein script to train TF model with Torch DataLoader --- ...train_tensorflow_orientation_from_torch.py | 471 ++++++++++++++++++ 1 file changed, 471 insertions(+) create mode 100644 references/classification/train_tensorflow_orientation_from_torch.py diff --git a/references/classification/train_tensorflow_orientation_from_torch.py b/references/classification/train_tensorflow_orientation_from_torch.py new file mode 100644 index 000000000..a4dec1286 --- /dev/null +++ b/references/classification/train_tensorflow_orientation_from_torch.py @@ -0,0 +1,471 @@ +# Copyright (C) 2021-2024, Mindee. + +# This program is licensed under the Apache License 2.0. +# See LICENSE or go to for full license details. + +import os + +os.environ["USE_TORCH"] = "1" + +import datetime +import logging +import multiprocessing as mp +import time +from pathlib import Path + +import numpy as np +import torch +import wandb +from torch.nn.functional import cross_entropy +from torch.optim.lr_scheduler import CosineAnnealingLR, MultiplicativeLR, OneCycleLR +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler +from torchvision.transforms import functional as F +from torchvision.transforms.v2 import ( + Compose, + GaussianBlur, + Normalize, + RandomGrayscale, + RandomPerspective, + RandomPhotometricDistort, +) +from tqdm.auto import tqdm + +from doctr import transforms as T +from doctr.datasets import OrientationDataset +from doctr.models import classification, login_to_hub, push_to_hf_hub +from doctr.models.utils import export_model_to_onnx +from utils import EarlyStopper, plot_recorder, plot_samples + +SLACK_WEBHOOK_URL = None +SLACK_WEBHOOK_PATH = Path(os.path.join(os.path.expanduser("~"), ".config", "doctr", "slack_webhook_url.txt")) +if SLACK_WEBHOOK_PATH.exists(): + with open(SLACK_WEBHOOK_PATH) as f: + SLACK_WEBHOOK_URL = f.read().strip() +else: + print(f"{SLACK_WEBHOOK_PATH} does not exist, skip Slack integration configuration...") + + +def send_on_slack(text: str): + """Send a message on Slack. + + Args: + text (str): message to send on Slack + """ + if SLACK_WEBHOOK_URL: + try: + import requests + + requests.post( + url=SLACK_WEBHOOK_URL, + json={"text": f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}"}, + ) + except Exception: + print("Impossible to send message on Slack, continue...") + + +CLASSES = [0, -90, 180, 90] + + +def rnd_rotate(img: torch.Tensor, target): + angle = int(np.random.choice(CLASSES)) + idx = CLASSES.index(angle) + # augment the angle randomly with a probability of 0.5 + if np.random.rand() < 0.5: + angle += float(np.random.choice(np.arange(-25, 25, 5))) + rotated_img = F.rotate(img, angle=-angle, fill=0, expand=angle not in CLASSES)[:3] + return rotated_img, idx + + +def record_lr( + model: torch.nn.Module, + train_loader: DataLoader, + batch_transforms, + optimizer, + start_lr: float = 1e-7, + end_lr: float = 1, + num_it: int = 100, + amp: bool = False, +): + """Gridsearch the optimal learning rate for the training. + Adapted from https://github.com/frgfm/Holocron/blob/master/holocron/trainer/core.py + """ + if num_it > len(train_loader): + raise ValueError("the value of `num_it` needs to be lower than the number of available batches") + + model = model.train() + # Update param groups & LR + optimizer.defaults["lr"] = start_lr + for pgroup in optimizer.param_groups: + pgroup["lr"] = start_lr + + gamma = (end_lr / start_lr) ** (1 / (num_it - 1)) + scheduler = MultiplicativeLR(optimizer, lambda step: gamma) + + lr_recorder = [start_lr * gamma**idx for idx in range(num_it)] + loss_recorder = [] + + if amp: + scaler = torch.cuda.amp.GradScaler() + + for batch_idx, (images, targets) in enumerate(train_loader): + if torch.cuda.is_available(): + images = images.cuda() + targets = targets.cuda() + + images = batch_transforms(images) + + # Forward, Backward & update + optimizer.zero_grad() + if amp: + with torch.cuda.amp.autocast(): + out = model(images) + train_loss = cross_entropy(out, targets) + scaler.scale(train_loss).backward() + # Update the params + scaler.step(optimizer) + scaler.update() + else: + out = model(images) + train_loss = cross_entropy(out, targets) + train_loss.backward() + optimizer.step() + # Update LR + scheduler.step() + + # Record + if not torch.isfinite(train_loss): + if batch_idx == 0: + raise ValueError("loss value is NaN or inf.") + else: + break + loss_recorder.append(train_loss.item()) + # Stop after the number of iterations + if batch_idx + 1 == num_it: + break + + return lr_recorder[: len(loss_recorder)], loss_recorder + + +def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False): + # Iterate over the batches of the dataset + last_progress = 0 + interval_progress = 5 + pbar = tqdm(train_loader, position=1) + send_on_slack(str(pbar)) + import tensorflow as tf + for images, targets in pbar: + images = batch_transforms(images) + + images = tf.convert_to_tensor(images) + images = tf.transpose(images, (0, 3, 2, 1)) + with tf.GradientTape() as tape: + out = model(images, training=True) + train_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(targets, out) + grads = tape.gradient(train_loss, model.trainable_weights) + if amp: + grads = optimizer.get_unscaled_gradients(grads) + optimizer.apply_gradients(zip(grads, model.trainable_weights)) + + pbar.set_description(f"Training loss: {train_loss.numpy().mean():.6}") + current_progress = pbar.n / pbar.total * 100 + if current_progress - last_progress > interval_progress: + send_on_slack(str(pbar)) + last_progress = int(current_progress) + send_on_slack(str(pbar)) + #send_on_slack(f"Final training loss: {train_loss.item():.6}") + + +def evaluate(model, val_loader, batch_transforms): + # Validation loop + last_progress = 0 + interval_progress = 5 + val_loss, correct, samples, batch_cnt = 0.0, 0.0, 0.0, 0.0 + val_iter = iter(val_loader) + pbar = tqdm(val_iter) + send_on_slack(str(pbar)) + import tensorflow as tf + for images, targets in pbar: + images = batch_transforms(images) + images = tf.convert_to_tensor(images) + images = tf.transpose(images, (0, 3, 2, 1)) + out = model(images, training=False) + loss = tf.nn.sparse_softmax_cross_entropy_with_logits(targets, out) + # Compute metric + correct += int((out.numpy().argmax(1) == targets.numpy()).sum()) + + val_loss += loss.numpy().mean() + batch_cnt += 1 + samples += images.shape[0] + + current_progress = pbar.n / pbar.total * 100 + if current_progress - last_progress > interval_progress: + send_on_slack(str(pbar)) + last_progress = int(current_progress) + + val_loss /= batch_cnt + acc = correct / samples + return val_loss, acc + + +def main(args): + print(args) + send_on_slack(f"Start training: {args}") + + if args.push_to_hub: + login_to_hub() + + if not isinstance(args.workers, int): + args.workers = min(16, mp.cpu_count()) + + torch.backends.cudnn.benchmark = True + + input_size = (512, 512) if args.type == "page" else (256, 256) + + # Load val data generator + st = time.time() + val_set = OrientationDataset( + img_folder=os.path.join(args.val_path, "images"), + img_transforms=Compose([ + T.Resize(input_size, preserve_aspect_ratio=True, symmetric_pad=True), + ]), + sample_transforms=T.SampleCompose([ + lambda x, y: rnd_rotate(x, y), + T.Resize(input_size), + ]), + ) + val_loader = DataLoader( + val_set, + batch_size=args.batch_size, + drop_last=False, + num_workers=args.workers, + sampler=SequentialSampler(val_set), + pin_memory=torch.cuda.is_available(), + ) + print(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in " f"{len(val_loader)} batches)") + send_on_slack( + f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in " f"{len(val_loader)} batches)" + ) + + batch_transforms = Normalize(mean=(0.694, 0.695, 0.693), std=(0.299, 0.296, 0.301)) + + # Load doctr model + import doctr.models.classification.mobilenet.tensorflow as classification_tf + model = classification_tf.__dict__[args.arch](pretrained=args.pretrained, num_classes=len(CLASSES), classes=CLASSES) + + # Resume weights + if isinstance(args.resume, str): + print(f"Resuming {args.resume}") + send_on_slack(f"Resuming {args.resume}") + checkpoint = torch.load(args.resume, map_location="cpu") + model.load_state_dict(checkpoint) + + # GPU + #if isinstance(args.device, int): + # if not torch.cuda.is_available(): + # raise AssertionError("PyTorch cannot access your GPU. Please investigate!") + # if args.device >= torch.cuda.device_count(): + # raise ValueError("Invalid device index") + ## Silent default switch to GPU if available + #elif torch.cuda.is_available(): + # args.device = 0 + #else: + # logging.warning("No accessible GPU, targe device set to CPU.") + #if torch.cuda.is_available(): + # torch.cuda.set_device(args.device) + # model = model.cuda() + + if args.test_only: + print("Running evaluation") + val_loss, acc = evaluate(model, val_loader, batch_transforms) + print(f"Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + return + + st = time.time() + train_set = OrientationDataset( + img_folder=os.path.join(args.train_path, "images"), + img_transforms=Compose([ + T.Resize(input_size, preserve_aspect_ratio=True, symmetric_pad=True), + # Augmentations + T.RandomApply(T.ColorInversion(), 0.1), + T.RandomApply(T.GaussianNoise(mean=0.1, std=0.1), 0.1), + T.RandomApply(T.RandomShadow(), 0.2), + T.RandomApply(GaussianBlur(kernel_size=3), 0.1), + RandomPhotometricDistort(p=0.1), + RandomGrayscale(p=0.1), + RandomPerspective(distortion_scale=0.1, p=0.3), + ]), + sample_transforms=T.SampleCompose([ + lambda x, y: rnd_rotate(x, y), + T.Resize(input_size), + ]), + ) + + train_loader = DataLoader( + train_set, + batch_size=args.batch_size, + drop_last=True, + num_workers=args.workers, + sampler=RandomSampler(train_set), + pin_memory=torch.cuda.is_available(), + ) + print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in " f"{len(train_loader)} batches)") + send_on_slack( + f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in " f"{len(train_loader)} batches)" + ) + + if args.show_samples: + x, target = next(iter(train_loader)) + plot_samples(x, [CLASSES[t] for t in target]) + return + + # Optimizer + #optimizer = torch.optim.Adam( + # [p for p in model.parameters() if p.requires_grad], + # args.lr, + # betas=(0.95, 0.99), + # eps=1e-6, + # weight_decay=args.weight_decay, + #) + import tensorflow as tf + scheduler = tf.keras.optimizers.schedules.ExponentialDecay( + args.lr, + decay_steps=args.epochs * len(train_loader), + decay_rate=1 / (1e3), # final lr as a fraction of initial lr + staircase=False, + name="ExponentialDecay", + ) + optimizer = tf.keras.optimizers.Adam( + learning_rate=scheduler, + beta_1=0.95, + beta_2=0.99, + epsilon=1e-6, + ) + + # LR Finder + if args.find_lr: + lrs, losses = record_lr(model, train_loader, batch_transforms, optimizer, amp=args.amp) + plot_recorder(lrs, losses) + return + ## Scheduler + #if args.sched == "cosine": + # scheduler = CosineAnnealingLR(optimizer, args.epochs * len(train_loader), eta_min=args.lr / 25e4) + #elif args.sched == "onecycle": + # scheduler = OneCycleLR(optimizer, args.lr, args.epochs * len(train_loader)) + + # Training monitoring + current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + exp_name = f"{args.arch}_{current_time}" if args.name is None else args.name + + # W&B + if args.wb: + run = wandb.init( + name=exp_name, + project="orientation-classification", + config={ + "learning_rate": args.lr, + "epochs": args.epochs, + "weight_decay": args.weight_decay, + "batch_size": args.batch_size, + "architecture": args.arch, + "input_size": input_size, + "optimizer": "adam", + "framework": "pytorch", + "classes": CLASSES, + "scheduler": args.sched, + "pretrained": args.pretrained, + }, + ) + + # Create loss queue + min_loss = np.inf + # Training loop + if args.early_stop: + early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta) + for epoch in range(args.epochs): + fit_one_epoch(model, train_loader, batch_transforms, optimizer) + model.save_weights(f"./{exp_name}_{epoch}/weights") + + try: + # Validation loop at the end of each epoch + val_loss, acc = evaluate(model, val_loader, batch_transforms) + if val_loss < min_loss: + print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + send_on_slack(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + model.save_weights(f"./{exp_name}/weights") + min_loss = val_loss + print(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + send_on_slack(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + except Exception: + import traceback + traceback.print_exc() + ## W&B + #if args.wb: + # wandb.log({ + # "val_loss": val_loss, + # "acc": acc, + # }) + #if args.early_stop and early_stopper.early_stop(val_loss): + # print("Training halted early due to reaching patience limit.") + # send_on_slack("Training halted early due to reaching patience limit.") + # break + if args.wb: + run.finish() + + if args.push_to_hub: + push_to_hf_hub(model, exp_name, task="classification", run_config=args) + + if args.export_onnx: + print("Exporting model to ONNX...") + dummy_batch = next(iter(val_loader)) + dummy_input = dummy_batch[0].cuda() if torch.cuda.is_available() else dummy_batch[0] + model_path = export_model_to_onnx(model, exp_name, dummy_input) + print(f"Exported model saved in {model_path}") + + +def parse_args(): + import argparse + + parser = argparse.ArgumentParser( + description="DocTR training script for orientation classification (PyTorch)", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + parser.add_argument("train_path", type=str, help="path to training data folder") + parser.add_argument("val_path", type=str, help="path to validation data folder") + parser.add_argument("arch", type=str, help="classification model to train") + parser.add_argument("type", type=str, choices=["page", "crop"], help="type of data to train on") + parser.add_argument("--name", type=str, default=None, help="Name of your training experiment") + parser.add_argument("--epochs", type=int, default=10, help="number of epochs to train the model on") + parser.add_argument("-b", "--batch_size", type=int, default=2, help="batch size for training") + parser.add_argument("--device", default=None, type=int, help="device") + parser.add_argument("--lr", type=float, default=0.001, help="learning rate for the optimizer (Adam)") + parser.add_argument("--wd", "--weight-decay", default=0, type=float, help="weight decay", dest="weight_decay") + parser.add_argument("-j", "--workers", type=int, default=None, help="number of workers used for dataloading") + parser.add_argument("--resume", type=str, default=None, help="Path to your checkpoint") + parser.add_argument("--test-only", dest="test_only", action="store_true", help="Run the validation loop") + parser.add_argument( + "--show-samples", dest="show_samples", action="store_true", help="Display unormalized training samples" + ) + parser.add_argument("--wb", dest="wb", action="store_true", help="Log to Weights & Biases") + parser.add_argument("--push-to-hub", dest="push_to_hub", action="store_true", help="Push to Huggingface Hub") + parser.add_argument( + "--pretrained", + dest="pretrained", + action="store_true", + help="Load pretrained parameters before starting the training", + ) + parser.add_argument("--export-onnx", dest="export_onnx", action="store_true", help="Export the model to ONNX") + parser.add_argument("--sched", type=str, default="cosine", help="scheduler to use") + parser.add_argument("--amp", dest="amp", help="Use Automatic Mixed Precision", action="store_true") + parser.add_argument("--find-lr", action="store_true", help="Gridsearch the optimal LR") + parser.add_argument("--early-stop", action="store_true", help="Enable early stopping") + parser.add_argument("--early-stop-epochs", type=int, default=5, help="Patience for early stopping") + parser.add_argument("--early-stop-delta", type=float, default=0.01, help="Minimum Delta for early stopping") + args = parser.parse_args() + + return args + + +if __name__ == "__main__": + args = parse_args() + main(args) From e56b7c07d9f46baa289bbc352e10826fd7cf9bb2 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Wed, 5 Jun 2024 07:36:43 +0000 Subject: [PATCH 25/39] fix send_on_slack --- references/classification/train_tensorflow_orientation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/references/classification/train_tensorflow_orientation.py b/references/classification/train_tensorflow_orientation.py index e7116595f..a5ed27c19 100644 --- a/references/classification/train_tensorflow_orientation.py +++ b/references/classification/train_tensorflow_orientation.py @@ -146,7 +146,7 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False): if current_progress - last_progress > interval_progress: send_on_slack(str(pbar)) last_progress = int(current_progress) - send_on_slack(f"Final training loss: {train_loss.item():.6}") + send_on_slack(f"Final training loss: {train_loss.numpy():.6}") def evaluate(model, val_loader, batch_transforms): From 983e815ecf549eb53a7a7c2473cf9605b0ce202b Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Wed, 5 Jun 2024 12:00:05 +0000 Subject: [PATCH 26/39] upd --- references/classification/train_tensorflow_orientation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/references/classification/train_tensorflow_orientation.py b/references/classification/train_tensorflow_orientation.py index a5ed27c19..16048161f 100644 --- a/references/classification/train_tensorflow_orientation.py +++ b/references/classification/train_tensorflow_orientation.py @@ -146,7 +146,7 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False): if current_progress - last_progress > interval_progress: send_on_slack(str(pbar)) last_progress = int(current_progress) - send_on_slack(f"Final training loss: {train_loss.numpy():.6}") + send_on_slack(f"Final training loss: {train_loss.numpy().mean():.6}") def evaluate(model, val_loader, batch_transforms): From 1efc8ee93a155a8cbe04fc4e66d3b3bc1c4fc77f Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Tue, 17 Dec 2024 15:18:37 +0100 Subject: [PATCH 27/39] revert few changes --- doctr/datasets/datasets/tensorflow.py | 2 +- doctr/models/detection/differentiable_binarization/pytorch.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/doctr/datasets/datasets/tensorflow.py b/doctr/datasets/datasets/tensorflow.py index 203c770eb..f0206cf03 100644 --- a/doctr/datasets/datasets/tensorflow.py +++ b/doctr/datasets/datasets/tensorflow.py @@ -52,7 +52,7 @@ def collate_fn(samples: list[tuple[tf.Tensor, Any]]) -> tuple[tf.Tensor, list[An images, targets = zip(*samples) images = tf.stack(images, axis=0) - return images, targets + return images, list(targets) class VisionDataset(AbstractDataset, _VisionDataset): # noqa: D101 diff --git a/doctr/models/detection/differentiable_binarization/pytorch.py b/doctr/models/detection/differentiable_binarization/pytorch.py index 74e68a340..cad6a74aa 100644 --- a/doctr/models/detection/differentiable_binarization/pytorch.py +++ b/doctr/models/detection/differentiable_binarization/pytorch.py @@ -286,8 +286,7 @@ def compute_loss( if torch.any(thresh_mask): l1_loss = (torch.abs(thresh_map - thresh_target) * thresh_mask).sum() / (thresh_mask.sum() + eps) - # return l1_loss + focal_scale * focal_loss + dice_loss - return focal_scale * focal_loss + dice_loss + return l1_loss + focal_scale * focal_loss + dice_loss def _dbnet( From 60a63bbbbfa7b6193aed982e2cdcd9948d7c5690 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Tue, 14 Jan 2025 13:51:16 +0100 Subject: [PATCH 28/39] clean `detection/train_pytorch.py` --- references/detection/train_pytorch.py | 40 ++++++--------------------- 1 file changed, 8 insertions(+), 32 deletions(-) diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py index 10e4a3f3f..ccc6f52a4 100644 --- a/references/detection/train_pytorch.py +++ b/references/detection/train_pytorch.py @@ -25,20 +25,14 @@ else: from tqdm.auto import tqdm +from slack_sdk import WebClient + from doctr import transforms as T from doctr.datasets import DetectionDataset from doctr.models import detection, login_to_hub, push_to_hf_hub from doctr.utils.metrics import LocalizationConfusion from utils import EarlyStopper, plot_recorder, plot_samples -SLACK_WEBHOOK_URL = None -SLACK_WEBHOOK_PATH = Path(os.path.join(os.path.expanduser("~"), ".config", "doctr", "slack_webhook_url.txt")) -if SLACK_WEBHOOK_PATH.exists(): - with open(SLACK_WEBHOOK_PATH) as f: - SLACK_WEBHOOK_URL = f.read().strip() -else: - print(f"{SLACK_WEBHOOK_PATH} does not exist, skip Slack integration configuration...") - def send_on_slack(text: str): """Send a message on Slack. @@ -46,16 +40,12 @@ def send_on_slack(text: str): Args: text (str): message to send on Slack """ - if SLACK_WEBHOOK_URL: - try: - import requests - - requests.post( - url=SLACK_WEBHOOK_URL, - json={"text": f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}"}, - ) - except Exception: - print("Impossible to send message on Slack, continue...") + if os.getenv("TQDM_SLACK_TOKEN") and os.getenv("TQDM_SLACK_CHANNEL"): + client = WebClient(token=os.getenv("TQDM_SLACK_TOKEN")) + client.chat_postMessage( + channel=os.getenv("TQDM_SLACK_CHANNEL"), + text=f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}", + ) def record_lr( @@ -135,10 +125,7 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a model.train() # Iterate over the batches of the dataset - last_progress = 0 - interval_progress = 5 pbar = tqdm(train_loader, position=1) - send_on_slack(str(pbar)) for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() @@ -163,10 +150,6 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a scheduler.step() pbar.set_description(f"Training loss: {train_loss.item():.6}") - current_progress = pbar.n / pbar.total * 100 - if current_progress - last_progress > interval_progress: - send_on_slack(str(pbar)) - last_progress = int(current_progress) send_on_slack(f"Final training loss: {train_loss.item():.6}") @@ -176,10 +159,7 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): model.eval() # Reset val metric val_metric.reset() - last_progress = 0 - interval_progress = 5 pbar = tqdm(val_loader) - send_on_slack(str(pbar)) # Validation loop val_loss, batch_cnt = 0, 0 for images, targets in pbar: @@ -200,10 +180,6 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): boxes_pred = np.concatenate((boxes_pred[:, :4].min(axis=1), boxes_pred[:, :4].max(axis=1)), axis=-1) val_metric.update(gts=boxes_gt, preds=boxes_pred[:, :4]) - current_progress = pbar.n / pbar.total * 100 - if current_progress - last_progress > interval_progress: - send_on_slack(str(pbar)) - last_progress = int(current_progress) val_loss += out["loss"].item() batch_cnt += 1 From 4dcd9724a0878d55ca1321ba3ab28d4ad8b01ade Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Tue, 14 Jan 2025 15:43:02 +0100 Subject: [PATCH 29/39] add clearml logging --- references/detection/train_pytorch.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py index ccc6f52a4..bcfdfee5a 100644 --- a/references/detection/train_pytorch.py +++ b/references/detection/train_pytorch.py @@ -496,6 +496,14 @@ def main(args): }, ) + # ClearML + if args.clearml: + from clearml import Task + + task = Task.init(project_name="docTR/text-detection", task_name=exp_name, reuse_last_task_id=False) + task.upload_artifact("config", config) + + # Create loss queue min_loss = np.inf if args.early_stop: @@ -547,6 +555,16 @@ def main(args): "precision": precision, "mean_iou": mean_iou, }) + + # ClearML + if args.clearml: + from clearml import Logger + + logger = Logger.current_logger() + logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch) + logger.report_scalar(title="Precision Recall", series="recall", value=recall, iteration=epoch) + logger.report_scalar(title="Precision Recall", series="precision", value=precision, iteration=epoch) + logger.report_scalar(title="Mean IoU", series="mean_iou", value=mean_iou, iteration=epoch) if args.early_stop and early_stopper.early_stop(val_loss): print("Training halted early due to reaching patience limit.") break @@ -589,6 +607,7 @@ def parse_args(): "--show-samples", dest="show_samples", action="store_true", help="Display unormalized training samples" ) parser.add_argument("--wb", dest="wb", action="store_true", help="Log to Weights & Biases") + parser.add_argument("--clearml", dest="clearml", action="store_true", help="Log to ClearML") parser.add_argument("--push-to-hub", dest="push_to_hub", action="store_true", help="Push to Huggingface Hub") parser.add_argument( "--pretrained", From e182ae7a005e4c0c5ebb3da3f0b52b3ea4118a5d Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Tue, 14 Jan 2025 16:19:26 +0100 Subject: [PATCH 30/39] add boto3 --- references/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/references/requirements.txt b/references/requirements.txt index 90e24543d..84d95095b 100644 --- a/references/requirements.txt +++ b/references/requirements.txt @@ -1,6 +1,7 @@ -e . tqdm slack-sdk +boto3>=1.9 wandb>=0.10.31 clearml>=1.11.1 matplotlib>=3.1.0 From 6494f87a8b0b862cbc7597941db7abd068b9b567 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Tue, 14 Jan 2025 17:02:40 +0100 Subject: [PATCH 31/39] `config` --- references/detection/train_pytorch.py | 33 ++++++++++++++------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py index bcfdfee5a..7ef20f5dd 100644 --- a/references/detection/train_pytorch.py +++ b/references/detection/train_pytorch.py @@ -470,6 +470,22 @@ def main(args): # Training monitoring current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") exp_name = f"{args.arch}_{current_time}" if args.name is None else args.name + config = { + "learning_rate": args.lr, + "epochs": args.epochs, + "weight_decay": args.weight_decay, + "batch_size": args.batch_size, + "architecture": args.arch, + "input_size": args.input_size, + "optimizer": args.optim, + "framework": "pytorch", + "scheduler": args.sched, + "train_hash": train_hash, + "val_hash": val_hash, + "pretrained": args.pretrained, + "rotation": args.rotation, + "amp": args.amp, + } # W&B if args.wb: @@ -478,22 +494,7 @@ def main(args): run = wandb.init( name=exp_name, project="text-detection", - config={ - "learning_rate": args.lr, - "epochs": args.epochs, - "weight_decay": args.weight_decay, - "batch_size": args.batch_size, - "architecture": args.arch, - "input_size": args.input_size, - "optimizer": args.optim, - "framework": "pytorch", - "scheduler": args.sched, - "train_hash": train_hash, - "val_hash": val_hash, - "pretrained": args.pretrained, - "rotation": args.rotation, - "amp": args.amp, - }, + config=config, ) # ClearML From 2a12cfc497ba2e5b2dbbbe9f0719fc76b95d8ccd Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Thu, 16 Jan 2025 18:13:53 +0100 Subject: [PATCH 32/39] Grad accumulation - testing --- references/recognition/train_pytorch.py | 41 ++++++++++++++----------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/references/recognition/train_pytorch.py b/references/recognition/train_pytorch.py index 70a841dce..ba443684d 100644 --- a/references/recognition/train_pytorch.py +++ b/references/recognition/train_pytorch.py @@ -109,7 +109,7 @@ def record_lr( return lr_recorder[: len(loss_recorder)], loss_recorder -def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False, clearml_log=False): +def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, grad_accumulation_steps, amp=False, clearml_log=False): if amp: scaler = torch.cuda.amp.GradScaler() @@ -121,37 +121,37 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a # Iterate over the batches of the dataset pbar = tqdm(train_loader, position=1) - for images, targets in pbar: + for step, (images, targets) in enumerate(pbar): if torch.cuda.is_available(): images = images.cuda() images = batch_transforms(images) - train_loss = model(images, targets)["loss"] - - optimizer.zero_grad() if amp: with torch.cuda.amp.autocast(): - train_loss = model(images, targets)["loss"] + train_loss = model(images, targets)["loss"] / grad_accumulation_steps scaler.scale(train_loss).backward() # Gradient clipping scaler.unscale_(optimizer) torch.nn.utils.clip_grad_norm_(model.parameters(), 5) - # Update the params - scaler.step(optimizer) - scaler.update() else: - train_loss = model(images, targets)["loss"] + train_loss = model(images, targets)["loss"] / grad_accumulation_steps train_loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 5) - optimizer.step() - scheduler.step() + if (step + 1) % grad_accumulation_steps == 0 or step + 1 == len(train_loader): + if amp: + scaler.step(optimizer) + scaler.update() + else: + optimizer.step() + optimizer.zero_grad() + scheduler.step() - pbar.set_description(f"Training loss: {train_loss.item():.6}") + pbar.set_description(f"Training loss: {train_loss.item() * grad_accumulation_steps:.6f}") if clearml_log: global iteration logger.report_scalar( - title="Training Loss", series="train_loss", value=train_loss.item(), iteration=iteration + title="Training Loss", series="train_loss", value=train_loss.item() * grad_accumulation_steps, iteration=iteration ) iteration += 1 @@ -376,12 +376,16 @@ def main(args): return # Scheduler + # Effective steps per epoch (due to grad accumulation) + grad_steps = args.grad_accumulation + effective_steps_per_epoch = len(train_loader) // grad_steps + total_steps = args.epochs * effective_steps_per_epoch if args.sched == "cosine": - scheduler = CosineAnnealingLR(optimizer, args.epochs * len(train_loader), eta_min=args.lr / 25e4) + scheduler = CosineAnnealingLR(optimizer, total_steps, eta_min=args.lr / 25e4) elif args.sched == "onecycle": - scheduler = OneCycleLR(optimizer, args.lr, args.epochs * len(train_loader)) + scheduler = OneCycleLR(optimizer, args.lr, total_steps) elif args.sched == "poly": - scheduler = PolynomialLR(optimizer, args.epochs * len(train_loader)) + scheduler = PolynomialLR(optimizer, total_steps) # Training monitoring current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") @@ -429,7 +433,7 @@ def main(args): early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta) for epoch in range(args.epochs): fit_one_epoch( - model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, clearml_log=args.clearml + model, train_loader, batch_transforms, optimizer, scheduler, grad_accumulation_steps=grad_steps, amp=args.amp, clearml_log=args.clearml ) # Validation loop at the end of each epoch @@ -501,6 +505,7 @@ def parse_args(): parser.add_argument("--name", type=str, default=None, help="Name of your training experiment") parser.add_argument("--epochs", type=int, default=10, help="number of epochs to train the model on") parser.add_argument("-b", "--batch_size", type=int, default=64, help="batch size for training") + parser.add_argument("--grad_accumulation", type=int, default=1, help="gradient accumulation steps") parser.add_argument("--device", default=None, type=int, help="device") parser.add_argument("--input_size", type=int, default=32, help="input size H for the model, W = 4*H") parser.add_argument("--lr", type=float, default=0.001, help="learning rate for the optimizer (Adam or AdamW)") From 8f5ccf9986c8bf1447e896715e7f04d20dadb1f1 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Thu, 16 Jan 2025 18:31:20 +0100 Subject: [PATCH 33/39] Revert "Grad accumulation - testing" This reverts commit 2a12cfc497ba2e5b2dbbbe9f0719fc76b95d8ccd. --- references/recognition/train_pytorch.py | 41 +++++++++++-------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/references/recognition/train_pytorch.py b/references/recognition/train_pytorch.py index ba443684d..70a841dce 100644 --- a/references/recognition/train_pytorch.py +++ b/references/recognition/train_pytorch.py @@ -109,7 +109,7 @@ def record_lr( return lr_recorder[: len(loss_recorder)], loss_recorder -def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, grad_accumulation_steps, amp=False, clearml_log=False): +def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False, clearml_log=False): if amp: scaler = torch.cuda.amp.GradScaler() @@ -121,37 +121,37 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, g # Iterate over the batches of the dataset pbar = tqdm(train_loader, position=1) - for step, (images, targets) in enumerate(pbar): + for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() images = batch_transforms(images) + train_loss = model(images, targets)["loss"] + + optimizer.zero_grad() if amp: with torch.cuda.amp.autocast(): - train_loss = model(images, targets)["loss"] / grad_accumulation_steps + train_loss = model(images, targets)["loss"] scaler.scale(train_loss).backward() # Gradient clipping scaler.unscale_(optimizer) torch.nn.utils.clip_grad_norm_(model.parameters(), 5) + # Update the params + scaler.step(optimizer) + scaler.update() else: - train_loss = model(images, targets)["loss"] / grad_accumulation_steps + train_loss = model(images, targets)["loss"] train_loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 5) + optimizer.step() - if (step + 1) % grad_accumulation_steps == 0 or step + 1 == len(train_loader): - if amp: - scaler.step(optimizer) - scaler.update() - else: - optimizer.step() - optimizer.zero_grad() - scheduler.step() + scheduler.step() - pbar.set_description(f"Training loss: {train_loss.item() * grad_accumulation_steps:.6f}") + pbar.set_description(f"Training loss: {train_loss.item():.6}") if clearml_log: global iteration logger.report_scalar( - title="Training Loss", series="train_loss", value=train_loss.item() * grad_accumulation_steps, iteration=iteration + title="Training Loss", series="train_loss", value=train_loss.item(), iteration=iteration ) iteration += 1 @@ -376,16 +376,12 @@ def main(args): return # Scheduler - # Effective steps per epoch (due to grad accumulation) - grad_steps = args.grad_accumulation - effective_steps_per_epoch = len(train_loader) // grad_steps - total_steps = args.epochs * effective_steps_per_epoch if args.sched == "cosine": - scheduler = CosineAnnealingLR(optimizer, total_steps, eta_min=args.lr / 25e4) + scheduler = CosineAnnealingLR(optimizer, args.epochs * len(train_loader), eta_min=args.lr / 25e4) elif args.sched == "onecycle": - scheduler = OneCycleLR(optimizer, args.lr, total_steps) + scheduler = OneCycleLR(optimizer, args.lr, args.epochs * len(train_loader)) elif args.sched == "poly": - scheduler = PolynomialLR(optimizer, total_steps) + scheduler = PolynomialLR(optimizer, args.epochs * len(train_loader)) # Training monitoring current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") @@ -433,7 +429,7 @@ def main(args): early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta) for epoch in range(args.epochs): fit_one_epoch( - model, train_loader, batch_transforms, optimizer, scheduler, grad_accumulation_steps=grad_steps, amp=args.amp, clearml_log=args.clearml + model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, clearml_log=args.clearml ) # Validation loop at the end of each epoch @@ -505,7 +501,6 @@ def parse_args(): parser.add_argument("--name", type=str, default=None, help="Name of your training experiment") parser.add_argument("--epochs", type=int, default=10, help="number of epochs to train the model on") parser.add_argument("-b", "--batch_size", type=int, default=64, help="batch size for training") - parser.add_argument("--grad_accumulation", type=int, default=1, help="gradient accumulation steps") parser.add_argument("--device", default=None, type=int, help="device") parser.add_argument("--input_size", type=int, default=32, help="input size H for the model, W = 4*H") parser.add_argument("--lr", type=float, default=0.001, help="learning rate for the optimizer (Adam or AdamW)") From 934730ebf789757f0dcbefa374134d3d99e7ab30 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Thu, 16 Jan 2025 18:34:12 +0100 Subject: [PATCH 34/39] Grad accumulation - testing --- references/detection/train_pytorch.py | 41 ++++++++++++++++----------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py index d6af3f69a..80b05d427 100644 --- a/references/detection/train_pytorch.py +++ b/references/detection/train_pytorch.py @@ -119,7 +119,7 @@ def record_lr( return lr_recorder[: len(loss_recorder)], loss_recorder -def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False, clearml_log=False): +def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, grad_accumulation_steps, amp=False, clearml_log=False): if amp: scaler = torch.cuda.amp.GradScaler() @@ -131,34 +131,38 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a # Iterate over the batches of the dataset pbar = tqdm(train_loader, position=1) - for images, targets in pbar: + for step, (images, targets) in enumerate(pbar): if torch.cuda.is_available(): images = images.cuda() images = batch_transforms(images) - optimizer.zero_grad() if amp: with torch.cuda.amp.autocast(): - train_loss = model(images, targets)["loss"] + train_loss = model(images, targets)["loss"] / grad_accumulation_steps scaler.scale(train_loss).backward() # Gradient clipping scaler.unscale_(optimizer) torch.nn.utils.clip_grad_norm_(model.parameters(), 5) - # Update the params - scaler.step(optimizer) - scaler.update() else: - train_loss = model(images, targets)["loss"] + train_loss = model(images, targets)["loss"] / grad_accumulation_steps train_loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 5) - optimizer.step() - scheduler.step() - pbar.set_description(f"Training loss: {train_loss.item():.6}") + if (step + 1) % grad_accumulation_steps == 0 or step + 1 == len(train_loader): + if amp: + scaler.step(optimizer) + scaler.update() + else: + optimizer.step() + + optimizer.zero_grad() + scheduler.step() + + pbar.set_description(f"Training loss: {train_loss.item() * grad_accumulation_steps:.6f}") if clearml_log: global iteration logger.report_scalar( - title="Training Loss", series="train_loss", value=train_loss.item(), iteration=iteration + title="Training Loss", series="train_loss", value=train_loss.item() * grad_accumulation_steps, iteration=iteration ) iteration += 1 send_on_slack(f"Final training loss: {train_loss.item():.6}") @@ -471,12 +475,16 @@ def main(args): return # Scheduler + # Effective steps per epoch (due to grad accumulation) + grad_steps = args.grad_accumulation + effective_steps_per_epoch = len(train_loader) // grad_steps + total_steps = args.epochs * effective_steps_per_epoch if args.sched == "cosine": - scheduler = CosineAnnealingLR(optimizer, args.epochs * len(train_loader), eta_min=args.lr / 25e4) + scheduler = CosineAnnealingLR(optimizer, total_steps, eta_min=args.lr / 25e4) elif args.sched == "onecycle": - scheduler = OneCycleLR(optimizer, args.lr, args.epochs * len(train_loader)) + scheduler = OneCycleLR(optimizer, args.lr, total_steps) elif args.sched == "poly": - scheduler = PolynomialLR(optimizer, args.epochs * len(train_loader)) + scheduler = PolynomialLR(optimizer, total_steps) # Training monitoring current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") @@ -525,7 +533,7 @@ def main(args): # Training loop for epoch in range(args.epochs): fit_one_epoch( - model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, clearml_log=args.clearml + model, train_loader, batch_transforms, optimizer, scheduler, grad_steps, amp=args.amp, clearml_log=args.clearml ) # Validation loop at the end of each epoch val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric, amp=args.amp) @@ -606,6 +614,7 @@ def parse_args(): parser.add_argument("--name", type=str, default=None, help="Name of your training experiment") parser.add_argument("--epochs", type=int, default=10, help="number of epochs to train the model on") parser.add_argument("-b", "--batch_size", type=int, default=2, help="batch size for training") + parser.add_argument("--grad_accumulation", type=int, default=1, help="gradient accumulation steps") parser.add_argument("--device", default=None, type=int, help="device") parser.add_argument( "--save-interval-epoch", dest="save_interval_epoch", action="store_true", help="Save model every epoch" From d3bcd09f6ea4a77d1bb6b39cd2d044070e75fd6c Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Tue, 21 Jan 2025 16:16:22 +0100 Subject: [PATCH 35/39] `power=0.5` for polynomialLR --- references/detection/train_pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py index 80b05d427..610cbe2ae 100644 --- a/references/detection/train_pytorch.py +++ b/references/detection/train_pytorch.py @@ -484,7 +484,7 @@ def main(args): elif args.sched == "onecycle": scheduler = OneCycleLR(optimizer, args.lr, total_steps) elif args.sched == "poly": - scheduler = PolynomialLR(optimizer, total_steps) + scheduler = PolynomialLR(optimizer, total_steps, power=0.5) # Training monitoring current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") From b912d8fcd81b37ed7ccc6f7c13604ef18eca3d51 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Thu, 23 Jan 2025 15:37:26 +0100 Subject: [PATCH 36/39] clean branch --- .../train_pytorch_orientation.py | 36 ----------------- .../train_tensorflow_orientation.py | 40 ------------------- references/detection/train_pytorch.py | 19 +-------- references/detection/train_tensorflow.py | 32 --------------- 4 files changed, 1 insertion(+), 126 deletions(-) diff --git a/references/classification/train_pytorch_orientation.py b/references/classification/train_pytorch_orientation.py index beb63c8f5..13df5f843 100644 --- a/references/classification/train_pytorch_orientation.py +++ b/references/classification/train_pytorch_orientation.py @@ -38,33 +38,6 @@ from doctr.models.utils import export_model_to_onnx from utils import EarlyStopper, plot_recorder, plot_samples -SLACK_WEBHOOK_URL = None -SLACK_WEBHOOK_PATH = Path(os.path.join(os.path.expanduser("~"), ".config", "doctr", "slack_webhook_url.txt")) -if SLACK_WEBHOOK_PATH.exists(): - with open(SLACK_WEBHOOK_PATH) as f: - SLACK_WEBHOOK_URL = f.read().strip() -else: - print(f"{SLACK_WEBHOOK_PATH} does not exist, skip Slack integration configuration...") - - -def send_on_slack(text: str): - """Send a message on Slack. - - Args: - text (str): message to send on Slack - """ - if SLACK_WEBHOOK_URL: - try: - import requests - - requests.post( - url=SLACK_WEBHOOK_URL, - json={"text": f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}"}, - ) - except Exception: - print("Impossible to send message on Slack, continue...") - - CLASSES = [0, -90, 180, 90] @@ -195,10 +168,6 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a def evaluate(model, val_loader, batch_transforms, amp=False, log=None): # Model in eval mode model.eval() - last_progress = 0 - interval_progress = 5 - pbar = tqdm(val_loader) - send_on_slack(str(pbar)) # Validation loop val_loss, correct, samples, batch_cnt = 0.0, 0.0, 0.0, 0.0 pbar = tqdm(val_loader, dynamic_ncols=True) @@ -226,11 +195,6 @@ def evaluate(model, val_loader, batch_transforms, amp=False, log=None): batch_cnt += 1 samples += images.shape[0] - current_progress = pbar.n / pbar.total * 100 - if current_progress - last_progress > interval_progress: - send_on_slack(str(pbar)) - last_progress = int(current_progress) - val_loss /= batch_cnt acc = correct / samples return val_loss, acc diff --git a/references/classification/train_tensorflow_orientation.py b/references/classification/train_tensorflow_orientation.py index b133ccbfe..87c46b62d 100644 --- a/references/classification/train_tensorflow_orientation.py +++ b/references/classification/train_tensorflow_orientation.py @@ -38,33 +38,6 @@ from doctr.transforms.functional import rotated_img_tensor from utils import EarlyStopper, plot_recorder, plot_samples -SLACK_WEBHOOK_URL = None -SLACK_WEBHOOK_PATH = Path(os.path.join(os.path.expanduser("~"), ".config", "doctr", "slack_webhook_url.txt")) -if SLACK_WEBHOOK_PATH.exists(): - with open(SLACK_WEBHOOK_PATH) as f: - SLACK_WEBHOOK_URL = f.read().strip() -else: - print(f"{SLACK_WEBHOOK_PATH} does not exist, skip Slack integration configuration...") - - -def send_on_slack(text: str): - """Send a message on Slack. - - Args: - text (str): message to send on Slack - """ - if SLACK_WEBHOOK_URL: - try: - import requests - - requests.post( - url=SLACK_WEBHOOK_URL, - json={"text": f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}"}, - ) - except Exception: - print("Impossible to send message on Slack, continue...") - - CLASSES = [0, -90, 180, 90] @@ -168,8 +141,6 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False, l def evaluate(model, val_loader, batch_transforms, log=None): # Validation loop - last_progress = 0 - interval_progress = 5 val_loss, correct, samples, batch_cnt = 0.0, 0.0, 0.0, 0.0 val_iter = iter(val_loader) pbar = tqdm(val_iter, dynamic_ncols=True) @@ -187,11 +158,6 @@ def evaluate(model, val_loader, batch_transforms, log=None): batch_cnt += 1 samples += images.shape[0] - current_progress = pbar.n / pbar.total * 100 - if current_progress - last_progress > interval_progress: - send_on_slack(str(pbar)) - last_progress = int(current_progress) - val_loss /= batch_cnt acc = correct / samples return val_loss, acc @@ -239,9 +205,6 @@ def main(args): pbar.write( f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {val_loader.num_batches} batches)" ) - send_on_slack( - f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {val_loader.num_batches} batches)" - ) # Load doctr model model = classification.__dict__[args.arch]( @@ -296,9 +259,6 @@ def main(args): pbar.write( f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {train_loader.num_batches} batches)" ) - send_on_slack( - f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {train_loader.num_batches} batches)" - ) if args.show_samples: x, target = next(iter(train_loader)) diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py index 29094d0d9..2d72bdde7 100644 --- a/references/detection/train_pytorch.py +++ b/references/detection/train_pytorch.py @@ -4,7 +4,6 @@ # See LICENSE or go to for full license details. import os -from pathlib import Path os.environ["USE_TORCH"] = "1" @@ -13,6 +12,7 @@ import logging import multiprocessing as mp import time +from pathlib import Path import numpy as np import torch @@ -25,8 +25,6 @@ else: from tqdm.auto import tqdm -from slack_sdk import WebClient - from doctr import transforms as T from doctr.datasets import DetectionDataset from doctr.models import detection, login_to_hub, push_to_hf_hub @@ -34,20 +32,6 @@ from utils import EarlyStopper, plot_recorder, plot_samples -def send_on_slack(text: str): - """Send a message on Slack. - - Args: - text (str): message to send on Slack - """ - if os.getenv("TQDM_SLACK_TOKEN") and os.getenv("TQDM_SLACK_CHANNEL"): - client = WebClient(token=os.getenv("TQDM_SLACK_TOKEN")) - client.chat_postMessage( - channel=os.getenv("TQDM_SLACK_CHANNEL"), - text=f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}", - ) - - def record_lr( model: torch.nn.Module, train_loader: DataLoader, @@ -163,7 +147,6 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False, log=Non model.eval() # Reset val metric val_metric.reset() - pbar = tqdm(val_loader) # Validation loop val_loss, batch_cnt = 0, 0 pbar = tqdm(val_loader, dynamic_ncols=True) diff --git a/references/detection/train_tensorflow.py b/references/detection/train_tensorflow.py index 2fe3de5bb..dde8d5eb5 100644 --- a/references/detection/train_tensorflow.py +++ b/references/detection/train_tensorflow.py @@ -38,32 +38,6 @@ from doctr.utils.metrics import LocalizationConfusion from utils import EarlyStopper, plot_recorder, plot_samples -SLACK_WEBHOOK_URL = None -SLACK_WEBHOOK_PATH = Path(os.path.join(os.path.expanduser("~"), ".config", "doctr", "slack_webhook_url.txt")) -if SLACK_WEBHOOK_PATH.exists(): - with open(SLACK_WEBHOOK_PATH) as f: - SLACK_WEBHOOK_URL = f.read().strip() -else: - print(f"{SLACK_WEBHOOK_PATH} does not exist, skip Slack integration configuration...") - - -def send_on_slack(text: str): - """Send a message on Slack. - - Args: - text (str): message to send on Slack - """ - if SLACK_WEBHOOK_URL: - try: - import requests - - requests.post( - url=SLACK_WEBHOOK_URL, - json={"text": f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]: {text}"}, - ) - except Exception: - print("Impossible to send message on Slack, continue...") - def record_lr( model: Model, @@ -222,9 +196,6 @@ def main(args): pbar.write( f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {val_loader.num_batches} batches)" ) - send_on_slack( - f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {val_loader.num_batches} batches)" - ) with open(os.path.join(args.val_path, "labels.json"), "rb") as f: val_hash = hashlib.sha256(f.read()).hexdigest() @@ -320,9 +291,6 @@ def main(args): pbar.write( f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {train_loader.num_batches} batches)" ) - send_on_slack( - f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {train_loader.num_batches} batches)" - ) with open(os.path.join(args.train_path, "labels.json"), "rb") as f: train_hash = hashlib.sha256(f.read()).hexdigest() From 5548afa982599264564723f92a7e03f9acc4fd61 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Mon, 27 Jan 2025 11:53:33 +0100 Subject: [PATCH 37/39] remove grad accumu --- references/detection/train_pytorch.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py index 2d72bdde7..78bf92e97 100644 --- a/references/detection/train_pytorch.py +++ b/references/detection/train_pytorch.py @@ -116,17 +116,22 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a images = images.cuda() images = batch_transforms(images) + optimizer.zero_grad() if amp: with torch.cuda.amp.autocast(): - train_loss = model(images, targets)["loss"] / grad_accumulation_steps + train_loss = model(images, targets)["loss"] scaler.scale(train_loss).backward() # Gradient clipping scaler.unscale_(optimizer) torch.nn.utils.clip_grad_norm_(model.parameters(), 5) + # Update the params + scaler.step(optimizer) + scaler.update() else: - train_loss = model(images, targets)["loss"] / grad_accumulation_steps + train_loss = model(images, targets)["loss"] train_loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 5) + optimizer.step() scheduler.step() last_lr = scheduler.get_last_lr()[0] @@ -444,16 +449,12 @@ def main(args): return # Scheduler - # Effective steps per epoch (due to grad accumulation) - grad_steps = args.grad_accumulation - effective_steps_per_epoch = len(train_loader) // grad_steps - total_steps = args.epochs * effective_steps_per_epoch if args.sched == "cosine": - scheduler = CosineAnnealingLR(optimizer, total_steps, eta_min=args.lr / 25e4) + scheduler = CosineAnnealingLR(optimizer, args.epochs * len(train_loader), eta_min=args.lr / 25e4) elif args.sched == "onecycle": - scheduler = OneCycleLR(optimizer, args.lr, total_steps) + scheduler = OneCycleLR(optimizer, args.lr, args.epochs * len(train_loader)) elif args.sched == "poly": - scheduler = PolynomialLR(optimizer, total_steps, power=0.5) + scheduler = PolynomialLR(optimizer, args.epochs * len(train_loader)) # Training monitoring current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") @@ -608,7 +609,6 @@ def parse_args(): parser.add_argument("--name", type=str, default=None, help="Name of your training experiment") parser.add_argument("--epochs", type=int, default=10, help="number of epochs to train the model on") parser.add_argument("-b", "--batch_size", type=int, default=2, help="batch size for training") - parser.add_argument("--grad_accumulation", type=int, default=1, help="gradient accumulation steps") parser.add_argument("--device", default=None, type=int, help="device") parser.add_argument( "--save-interval-epoch", dest="save_interval_epoch", action="store_true", help="Save model every epoch" From 1162d8b89b80ef3d53ea57c3449b1bf975eef806 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Mon, 27 Jan 2025 12:11:39 +0100 Subject: [PATCH 38/39] tqdm disable --- references/detection/train_pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py index 78bf92e97..d33734821 100644 --- a/references/detection/train_pytorch.py +++ b/references/detection/train_pytorch.py @@ -185,7 +185,7 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False, log=Non def main(args): - pbar = tqdm(disable=True) + pbar = tqdm(disable=False) pbar.write(str(args)) if args.push_to_hub: From 185ce11f61001a69e88c71490013f82161b5aae4 Mon Sep 17 00:00:00 2001 From: Olivier Dulcy Date: Mon, 27 Jan 2025 15:10:36 +0100 Subject: [PATCH 39/39] enable pbar.write --- references/detection/train_pytorch.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py index d33734821..b7a49db21 100644 --- a/references/detection/train_pytorch.py +++ b/references/detection/train_pytorch.py @@ -186,6 +186,9 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False, log=Non def main(args): pbar = tqdm(disable=False) + # Monkey patch tqdm write method to send messages directly to Slack + if os.getenv("TQDM_SLACK_TOKEN") and os.getenv("TQDM_SLACK_CHANNEL"): + pbar.write = lambda msg: pbar.sio.client.chat_postMessage(channel=os.getenv("TQDM_SLACK_CHANNEL"), text=msg) pbar.write(str(args)) if args.push_to_hub: