diff --git a/references/classification/train_pytorch_character.py b/references/classification/train_pytorch_character.py
index 51b390f7b..de4b19731 100644
--- a/references/classification/train_pytorch_character.py
+++ b/references/classification/train_pytorch_character.py
@@ -110,11 +110,16 @@ def record_lr(
     return lr_recorder[: len(loss_recorder)], loss_recorder
 
 
-def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False):
+def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False, clearml_log=False):
     if amp:
         scaler = torch.cuda.amp.GradScaler()
 
     model.train()
+    if clearml_log:
+        from clearml import Logger
+
+        logger = Logger.current_logger()
+
     # Iterate over the batches of the dataset
     pbar = tqdm(train_loader, position=1)
     for images, targets in pbar:
@@ -141,6 +146,12 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a
             scheduler.step()
 
         pbar.set_description(f"Training loss: {train_loss.item():.6}")
+        if clearml_log:
+            global iteration
+            logger.report_scalar(
+                title="Training Loss", series="train_loss", value=train_loss.item(), iteration=iteration
+            )
+            iteration += 1
 
 
 @torch.no_grad()
@@ -318,6 +329,20 @@ def main(args):
     current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
     exp_name = f"{args.arch}_{current_time}" if args.name is None else args.name
 
+    config = {
+        "learning_rate": args.lr,
+        "epochs": args.epochs,
+        "weight_decay": args.weight_decay,
+        "batch_size": args.batch_size,
+        "architecture": args.arch,
+        "input_size": args.input_size,
+        "optimizer": args.optim,
+        "framework": "pytorch",
+        "vocab": args.vocab,
+        "scheduler": args.sched,
+        "pretrained": args.pretrained,
+    }
+
     # W&B
     if args.wb:
         import wandb
@@ -325,28 +350,27 @@ def main(args):
         run = wandb.init(
             name=exp_name,
             project="character-classification",
-            config={
-                "learning_rate": args.lr,
-                "epochs": args.epochs,
-                "weight_decay": args.weight_decay,
-                "batch_size": args.batch_size,
-                "architecture": args.arch,
-                "input_size": args.input_size,
-                "optimizer": args.optim,
-                "framework": "pytorch",
-                "vocab": args.vocab,
-                "scheduler": args.sched,
-                "pretrained": args.pretrained,
-            },
+            config=config,
         )
 
+    # ClearML
+    if args.clearml:
+        from clearml import Task
+
+        task = Task.init(project_name="docTR/character-classification", task_name=exp_name, reuse_last_task_id=False)
+        task.upload_artifact("config", config)
+        global iteration
+        iteration = 0
+
     # Create loss queue
     min_loss = np.inf
     # Training loop
     if args.early_stop:
         early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta)
     for epoch in range(args.epochs):
-        fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler)
+        fit_one_epoch(
+            model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, clearml_log=args.clearml
+        )
 
         # Validation loop at the end of each epoch
         val_loss, acc = evaluate(model, val_loader, batch_transforms)
@@ -361,6 +385,15 @@ def main(args):
                 "val_loss": val_loss,
                 "acc": acc,
             })
+
+        # ClearML
+        if args.clearml:
+            from clearml import Logger
+
+            logger = Logger.current_logger()
+            logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch)
+            logger.report_scalar(title="Accuracy", series="acc", value=acc, iteration=epoch)
+
         if args.early_stop and early_stopper.early_stop(val_loss):
             print("Training halted early due to reaching patience limit.")
             break
@@ -420,6 +453,7 @@ def parse_args():
         "--show-samples", dest="show_samples", action="store_true", help="Display unormalized training samples"
     )
     parser.add_argument("--wb", dest="wb", action="store_true", help="Log to Weights & Biases")
+    parser.add_argument("--clearml", dest="clearml", action="store_true", help="Log to ClearML")
     parser.add_argument("--push-to-hub", dest="push_to_hub", action="store_true", help="Push to Huggingface Hub")
     parser.add_argument(
         "--pretrained",
diff --git a/references/classification/train_pytorch_orientation.py b/references/classification/train_pytorch_orientation.py
index b9e0b1892..dbec4886b 100644
--- a/references/classification/train_pytorch_orientation.py
+++ b/references/classification/train_pytorch_orientation.py
@@ -121,11 +121,16 @@ def record_lr(
     return lr_recorder[: len(loss_recorder)], loss_recorder
 
 
-def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False):
+def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False, clearml_log=False):
     if amp:
         scaler = torch.cuda.amp.GradScaler()
 
     model.train()
+    if clearml_log:
+        from clearml import Logger
+
+        logger = Logger.current_logger()
+
     # Iterate over the batches of the dataset
     pbar = tqdm(train_loader, position=1)
     for images, targets in pbar:
@@ -152,6 +157,12 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a
             scheduler.step()
 
         pbar.set_description(f"Training loss: {train_loss.item():.6}")
+        if clearml_log:
+            global iteration
+            logger.report_scalar(
+                title="Training Loss", series="train_loss", value=train_loss.item(), iteration=iteration
+            )
+            iteration += 1
 
 
 @torch.no_grad()
@@ -324,6 +335,20 @@ def main(args):
     current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
     exp_name = f"{args.arch}_{current_time}" if args.name is None else args.name
 
+    config = {
+        "learning_rate": args.lr,
+        "epochs": args.epochs,
+        "weight_decay": args.weight_decay,
+        "batch_size": args.batch_size,
+        "architecture": args.arch,
+        "input_size": input_size,
+        "optimizer": args.optim,
+        "framework": "pytorch",
+        "classes": CLASSES,
+        "scheduler": args.sched,
+        "pretrained": args.pretrained,
+    }
+
     # W&B
     if args.wb:
         import wandb
@@ -331,28 +356,27 @@ def main(args):
         run = wandb.init(
             name=exp_name,
             project="orientation-classification",
-            config={
-                "learning_rate": args.lr,
-                "epochs": args.epochs,
-                "weight_decay": args.weight_decay,
-                "batch_size": args.batch_size,
-                "architecture": args.arch,
-                "input_size": input_size,
-                "optimizer": args.optim,
-                "framework": "pytorch",
-                "classes": CLASSES,
-                "scheduler": args.sched,
-                "pretrained": args.pretrained,
-            },
+            config=config,
         )
 
+    # ClearML
+    if args.clearml:
+        from clearml import Task
+
+        task = Task.init(project_name="docTR/orientation-classification", task_name=exp_name, reuse_last_task_id=False)
+        task.upload_artifact("config", config)
+        global iteration
+        iteration = 0
+
     # Create loss queue
     min_loss = np.inf
     # Training loop
     if args.early_stop:
         early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta)
     for epoch in range(args.epochs):
-        fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler)
+        fit_one_epoch(
+            model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, clearml_log=args.clearml
+        )
 
         # Validation loop at the end of each epoch
         val_loss, acc = evaluate(model, val_loader, batch_transforms)
@@ -367,6 +391,15 @@ def main(args):
                 "val_loss": val_loss,
                 "acc": acc,
             })
+
+        # ClearML
+        if args.clearml:
+            from clearml import Logger
+
+            logger = Logger.current_logger()
+            logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch)
+            logger.report_scalar(title="Accuracy", series="acc", value=acc, iteration=epoch)
+
         if args.early_stop and early_stopper.early_stop(val_loss):
             print("Training halted early due to reaching patience limit.")
             break
@@ -410,6 +443,7 @@ def parse_args():
         "--show-samples", dest="show_samples", action="store_true", help="Display unormalized training samples"
     )
     parser.add_argument("--wb", dest="wb", action="store_true", help="Log to Weights & Biases")
+    parser.add_argument("--clearml", dest="clearml", action="store_true", help="Log to ClearML")
     parser.add_argument("--push-to-hub", dest="push_to_hub", action="store_true", help="Push to Huggingface Hub")
     parser.add_argument(
         "--pretrained",
diff --git a/references/classification/train_tensorflow_character.py b/references/classification/train_tensorflow_character.py
index f668e769d..5d7dfe728 100644
--- a/references/classification/train_tensorflow_character.py
+++ b/references/classification/train_tensorflow_character.py
@@ -96,7 +96,12 @@ def apply_grads(optimizer, grads, model):
     optimizer.apply_gradients(zip(grads, model.trainable_weights))
 
 
-def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False):
+def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False, clearml_log=False):
+    if clearml_log:
+        from clearml import Logger
+
+        logger = Logger.current_logger()
+
     # Iterate over the batches of the dataset
     pbar = tqdm(train_loader, position=1)
     for images, targets in pbar:
@@ -111,6 +116,12 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False):
         apply_grads(optimizer, grads, model)
 
         pbar.set_description(f"Training loss: {train_loss.numpy().mean():.6}")
+        if clearml_log:
+            global iteration
+            logger.report_scalar(
+                title="Training Loss", series="train_loss", value=train_loss.numpy().mean(), iteration=iteration
+            )
+            iteration += 1
 
 
 def evaluate(model, val_loader, batch_transforms):
@@ -315,6 +326,8 @@ def main(args):
 
         task = Task.init(project_name="docTR/character-classification", task_name=exp_name, reuse_last_task_id=False)
         task.upload_artifact("config", config)
+        global iteration
+        iteration = 0
 
     # Create loss queue
     min_loss = np.inf
@@ -323,7 +336,7 @@ def main(args):
     if args.early_stop:
         early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta)
     for epoch in range(args.epochs):
-        fit_one_epoch(model, train_loader, batch_transforms, optimizer, args.amp)
+        fit_one_epoch(model, train_loader, batch_transforms, optimizer, args.amp, args.clearml)
 
         # Validation loop at the end of each epoch
         val_loss, acc = evaluate(model, val_loader, batch_transforms)
@@ -346,6 +359,7 @@ def main(args):
             logger = Logger.current_logger()
             logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch)
             logger.report_scalar(title="Accuracy", series="acc", value=acc, iteration=epoch)
+
         if args.early_stop and early_stopper.early_stop(val_loss):
             print("Training halted early due to reaching patience limit.")
             break
diff --git a/references/classification/train_tensorflow_orientation.py b/references/classification/train_tensorflow_orientation.py
index 731b11ae1..b74a4be77 100644
--- a/references/classification/train_tensorflow_orientation.py
+++ b/references/classification/train_tensorflow_orientation.py
@@ -110,7 +110,12 @@ def apply_grads(optimizer, grads, model):
     optimizer.apply_gradients(zip(grads, model.trainable_weights))
 
 
-def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False):
+def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False, clearml_log=False):
+    if clearml_log:
+        from clearml import Logger
+
+        logger = Logger.current_logger()
+
     # Iterate over the batches of the dataset
     pbar = tqdm(train_loader, position=1)
     for images, targets in pbar:
@@ -125,6 +130,12 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False):
         apply_grads(optimizer, grads, model)
 
         pbar.set_description(f"Training loss: {train_loss.numpy().mean():.6}")
+        if clearml_log:
+            global iteration
+            logger.report_scalar(
+                title="Training Loss", series="train_loss", value=train_loss.numpy().mean(), iteration=iteration
+            )
+            iteration += 1
 
 
 def evaluate(model, val_loader, batch_transforms):
@@ -324,6 +335,8 @@ def main(args):
 
         task = Task.init(project_name="docTR/orientation-classification", task_name=exp_name, reuse_last_task_id=False)
         task.upload_artifact("config", config)
+        global iteration
+        iteration = 0
 
     # Create loss queue
     min_loss = np.inf
@@ -332,7 +345,7 @@ def main(args):
     if args.early_stop:
         early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta)
     for epoch in range(args.epochs):
-        fit_one_epoch(model, train_loader, batch_transforms, optimizer, args.amp)
+        fit_one_epoch(model, train_loader, batch_transforms, optimizer, args.amp, args.clearml)
 
         # Validation loop at the end of each epoch
         val_loss, acc = evaluate(model, val_loader, batch_transforms)
@@ -355,6 +368,7 @@ def main(args):
             logger = Logger.current_logger()
             logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch)
             logger.report_scalar(title="Accuracy", series="acc", value=acc, iteration=epoch)
+
         if args.early_stop and early_stopper.early_stop(val_loss):
             print("Training halted early due to reaching patience limit.")
             break
diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py
index 2729d64f3..6b0e9b692 100644
--- a/references/detection/train_pytorch.py
+++ b/references/detection/train_pytorch.py
@@ -103,11 +103,16 @@ def record_lr(
     return lr_recorder[: len(loss_recorder)], loss_recorder
 
 
-def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False):
+def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False, clearml_log=False):
     if amp:
         scaler = torch.cuda.amp.GradScaler()
 
     model.train()
+    if clearml_log:
+        from clearml import Logger
+
+        logger = Logger.current_logger()
+
     # Iterate over the batches of the dataset
     pbar = tqdm(train_loader, position=1)
     for images, targets in pbar:
@@ -135,6 +140,12 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a
             scheduler.step()
 
         pbar.set_description(f"Training loss: {train_loss.item():.6}")
+        if clearml_log:
+            global iteration
+            logger.report_scalar(
+                title="Training Loss", series="train_loss", value=train_loss.item(), iteration=iteration
+            )
+            iteration += 1
 
 
 @torch.no_grad()
@@ -369,6 +380,22 @@ def main(args):
     # Training monitoring
     current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
     exp_name = f"{args.arch}_{current_time}" if args.name is None else args.name
+    config = {
+        "learning_rate": args.lr,
+        "epochs": args.epochs,
+        "weight_decay": args.weight_decay,
+        "batch_size": args.batch_size,
+        "architecture": args.arch,
+        "input_size": args.input_size,
+        "optimizer": args.optim,
+        "framework": "pytorch",
+        "scheduler": args.sched,
+        "train_hash": train_hash,
+        "val_hash": val_hash,
+        "pretrained": args.pretrained,
+        "rotation": args.rotation,
+        "amp": args.amp,
+    }
 
     # W&B
     if args.wb:
@@ -377,24 +404,18 @@ def main(args):
         run = wandb.init(
             name=exp_name,
             project="text-detection",
-            config={
-                "learning_rate": args.lr,
-                "epochs": args.epochs,
-                "weight_decay": args.weight_decay,
-                "batch_size": args.batch_size,
-                "architecture": args.arch,
-                "input_size": args.input_size,
-                "optimizer": args.optim,
-                "framework": "pytorch",
-                "scheduler": args.sched,
-                "train_hash": train_hash,
-                "val_hash": val_hash,
-                "pretrained": args.pretrained,
-                "rotation": args.rotation,
-                "amp": args.amp,
-            },
+            config=config,
         )
 
+    # ClearML
+    if args.clearml:
+        from clearml import Task
+
+        task = Task.init(project_name="docTR/text-detection", task_name=exp_name, reuse_last_task_id=False)
+        task.upload_artifact("config", config)
+        global iteration
+        iteration = 0
+
     # Create loss queue
     min_loss = np.inf
     if args.early_stop:
@@ -402,7 +423,9 @@ def main(args):
 
     # Training loop
     for epoch in range(args.epochs):
-        fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp)
+        fit_one_epoch(
+            model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, clearml_log=args.clearml
+        )
         # Validation loop at the end of each epoch
         val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric, amp=args.amp)
         if val_loss < min_loss:
@@ -426,6 +449,17 @@ def main(args):
                 "precision": precision,
                 "mean_iou": mean_iou,
             })
+
+        # ClearML
+        if args.clearml:
+            from clearml import Logger
+
+            logger = Logger.current_logger()
+            logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch)
+            logger.report_scalar(title="Recall", series="recall", value=recall, iteration=epoch)
+            logger.report_scalar(title="Precision", series="precision", value=precision, iteration=epoch)
+            logger.report_scalar(title="Mean IoU", series="mean_iou", value=mean_iou, iteration=epoch)
+
         if args.early_stop and early_stopper.early_stop(val_loss):
             print("Training halted early due to reaching patience limit.")
             break
@@ -468,6 +502,7 @@ def parse_args():
         "--show-samples", dest="show_samples", action="store_true", help="Display unormalized training samples"
     )
     parser.add_argument("--wb", dest="wb", action="store_true", help="Log to Weights & Biases")
+    parser.add_argument("--clearml", dest="clearml", action="store_true", help="Log to ClearML")
     parser.add_argument("--push-to-hub", dest="push_to_hub", action="store_true", help="Push to Huggingface Hub")
     parser.add_argument(
         "--pretrained",
diff --git a/references/detection/train_pytorch_ddp.py b/references/detection/train_pytorch_ddp.py
index b41cdeec4..a96a6dc3b 100644
--- a/references/detection/train_pytorch_ddp.py
+++ b/references/detection/train_pytorch_ddp.py
@@ -109,11 +109,16 @@ def record_lr(
     return lr_recorder[: len(loss_recorder)], loss_recorder
 
 
-def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False):
+def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False, clearml_log=False):
     if amp:
         scaler = torch.cuda.amp.GradScaler()
 
     model.train()
+    if clearml_log:
+        from clearml import Logger
+
+        logger = Logger.current_logger()
+
     # Iterate over the batches of the dataset
     pbar = tqdm(train_loader, position=1)
     for images, targets in pbar:
@@ -141,6 +146,12 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a
            scheduler.step()
 
         pbar.set_description(f"Training loss: {train_loss.item():.6}")
+        if clearml_log:
+            global iteration
+            logger.report_scalar(
+                title="Training Loss", series="train_loss", value=train_loss.item(), iteration=iteration
+            )
+            iteration += 1
 
 
 @torch.no_grad()
@@ -386,29 +397,41 @@ def main(rank: int, world_size: int, args):
     current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
     exp_name = f"{args.arch}_{current_time}" if args.name is None else args.name
 
+    if rank == 0:
+        config = {
+            "learning_rate": args.lr,
+            "epochs": args.epochs,
+            "weight_decay": args.weight_decay,
+            "batch_size": args.batch_size,
+            "architecture": args.arch,
+            "input_size": args.input_size,
+            "optimizer": args.optim,
+            "framework": "pytorch",
+            "scheduler": args.sched,
+            "train_hash": train_hash,
+            "val_hash": val_hash,
+            "pretrained": args.pretrained,
+            "rotation": args.rotation,
+            "amp": args.amp,
+        }
+
     # W&B
     if rank == 0 and args.wb:
         run = wandb.init(
             name=exp_name,
             project="text-detection",
-            config={
-                "learning_rate": args.lr,
-                "epochs": args.epochs,
-                "weight_decay": args.weight_decay,
-                "batch_size": args.batch_size,
-                "architecture": args.arch,
-                "input_size": args.input_size,
-                "optimizer": args.optim,
-                "framework": "pytorch",
-                "scheduler": args.sched,
-                "train_hash": train_hash,
-                "val_hash": val_hash,
-                "pretrained": args.pretrained,
-                "rotation": args.rotation,
-                "amp": args.amp,
-            },
+            config=config,
         )
 
+    # ClearML
+    if rank == 0 and args.clearml:
+        from clearml import Task
+
+        task = Task.init(project_name="docTR/text-detection", task_name=exp_name, reuse_last_task_id=False)
+        task.upload_artifact("config", config)
+        global iteration
+        iteration = 0
+
     # Create loss queue
     min_loss = np.inf
     if args.early_stop:
@@ -416,7 +439,9 @@ def main(rank: int, world_size: int, args):
 
     # Training loop
     for epoch in range(args.epochs):
-        fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp)
+        fit_one_epoch(
+            model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, clearml_log=args.clearml
+        )
 
         if rank == 0:
             # Validation loop at the end of each epoch
@@ -444,6 +469,17 @@ def main(rank: int, world_size: int, args):
                     "precision": precision,
                     "mean_iou": mean_iou,
                 })
+
+            # ClearML
+            if args.clearml:
+                from clearml import Logger
+
+                logger = Logger.current_logger()
+                logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch)
+                logger.report_scalar(title="Precision Recall", series="recall", value=recall, iteration=epoch)
+                logger.report_scalar(title="Precision Recall", series="precision", value=precision, iteration=epoch)
+                logger.report_scalar(title="Mean IoU", series="mean_iou", value=mean_iou, iteration=epoch)
+
             if args.early_stop and early_stopper.early_stop(val_loss):
                 print("Training halted early due to reaching patience limit.")
                 break
@@ -491,6 +527,7 @@ def parse_args():
         "--show-samples", dest="show_samples", action="store_true", help="Display unormalized training samples"
     )
     parser.add_argument("--wb", dest="wb", action="store_true", help="Log to Weights & Biases")
+    parser.add_argument("--clearml", dest="clearml", action="store_true", help="Log to ClearML")
     parser.add_argument("--push-to-hub", dest="push_to_hub", action="store_true", help="Push to Huggingface Hub")
     parser.add_argument(
         "--pretrained",
diff --git a/references/detection/train_tensorflow.py b/references/detection/train_tensorflow.py
index 8b2e44a46..fb7d78b48 100644
--- a/references/detection/train_tensorflow.py
+++ b/references/detection/train_tensorflow.py
@@ -96,9 +96,14 @@ def apply_grads(optimizer, grads, model):
     optimizer.apply_gradients(zip(grads, model.trainable_weights))
 
 
-def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False):
+def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False, clearml_log=False):
     train_iter = iter(train_loader)
     # Iterate over the batches of the dataset
+    if clearml_log:
+        from clearml import Logger
+
+        logger = Logger.current_logger()
+
     pbar = tqdm(train_iter, position=1)
     for images, targets in pbar:
         images = batch_transforms(images)
@@ -111,6 +116,12 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False):
         apply_grads(optimizer, grads, model)
 
         pbar.set_description(f"Training loss: {train_loss.numpy():.6}")
+        if clearml_log:
+            global iteration
+            logger.report_scalar(
+                title="Training Loss", series="train_loss", value=train_loss.numpy(), iteration=iteration
+            )
+            iteration += 1
 
 
 def evaluate(model, val_loader, batch_transforms, val_metric):
@@ -363,6 +374,8 @@ def main(args):
 
         task = Task.init(project_name="docTR/text-detection", task_name=exp_name, reuse_last_task_id=False)
         task.upload_artifact("config", config)
+        global iteration
+        iteration = 0
 
     if args.freeze_backbone:
         for layer in model.feat_extractor.layers:
@@ -374,7 +387,7 @@ def main(args):
 
     # Training loop
     for epoch in range(args.epochs):
-        fit_one_epoch(model, train_loader, batch_transforms, optimizer, args.amp)
+        fit_one_epoch(model, train_loader, batch_transforms, optimizer, args.amp, args.clearml)
         # Validation loop at the end of each epoch
         val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric)
         if val_loss < min_loss:
@@ -408,6 +421,7 @@ def main(args):
             logger.report_scalar(title="Precision Recall", series="recall", value=recall, iteration=epoch)
             logger.report_scalar(title="Precision Recall", series="precision", value=precision, iteration=epoch)
             logger.report_scalar(title="Mean IoU", series="mean_iou", value=mean_iou, iteration=epoch)
+
         if args.early_stop and early_stopper.early_stop(val_loss):
             print("Training halted early due to reaching patience limit.")
             break
diff --git a/references/recognition/train_pytorch.py b/references/recognition/train_pytorch.py
index 0f69151ed..70a841dce 100644
--- a/references/recognition/train_pytorch.py
+++ b/references/recognition/train_pytorch.py
@@ -109,11 +109,16 @@ def record_lr(
     return lr_recorder[: len(loss_recorder)], loss_recorder
 
 
-def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False):
+def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False, clearml_log=False):
     if amp:
         scaler = torch.cuda.amp.GradScaler()
 
     model.train()
+    if clearml_log:
+        from clearml import Logger
+
+        logger = Logger.current_logger()
+
     # Iterate over the batches of the dataset
     pbar = tqdm(train_loader, position=1)
     for images, targets in pbar:
@@ -143,6 +148,12 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a
             scheduler.step()
 
         pbar.set_description(f"Training loss: {train_loss.item():.6}")
+        if clearml_log:
+            global iteration
+            logger.report_scalar(
+                title="Training Loss", series="train_loss", value=train_loss.item(), iteration=iteration
+            )
+            iteration += 1
 
 
 @torch.no_grad()
@@ -376,6 +387,22 @@ def main(args):
     current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
     exp_name = f"{args.arch}_{current_time}" if args.name is None else args.name
 
+    config = {
+        "learning_rate": args.lr,
+        "epochs": args.epochs,
+        "weight_decay": args.weight_decay,
+        "batch_size": args.batch_size,
"architecture": args.arch, + "input_size": args.input_size, + "optimizer": args.optim, + "framework": "pytorch", + "scheduler": args.sched, + "vocab": args.vocab, + "train_hash": train_hash, + "val_hash": val_hash, + "pretrained": args.pretrained, + } + # W&B if args.wb: import wandb @@ -383,30 +410,27 @@ def main(args): run = wandb.init( name=exp_name, project="text-recognition", - config={ - "learning_rate": args.lr, - "epochs": args.epochs, - "weight_decay": args.weight_decay, - "batch_size": args.batch_size, - "architecture": args.arch, - "input_size": args.input_size, - "optimizer": args.optim, - "framework": "pytorch", - "scheduler": args.sched, - "vocab": args.vocab, - "train_hash": train_hash, - "val_hash": val_hash, - "pretrained": args.pretrained, - }, + config=config, ) + # ClearML + if args.clearml: + from clearml import Task + + task = Task.init(project_name="docTR/text-recognition", task_name=exp_name, reuse_last_task_id=False) + task.upload_artifact("config", config) + global iteration + iteration = 0 + # Create loss queue min_loss = np.inf # Training loop if args.early_stop: early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta) for epoch in range(args.epochs): - fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp) + fit_one_epoch( + model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, clearml_log=args.clearml + ) # Validation loop at the end of each epoch val_loss, exact_match, partial_match = evaluate(model, val_loader, batch_transforms, val_metric, amp=args.amp) @@ -425,6 +449,16 @@ def main(args): "exact_match": exact_match, "partial_match": partial_match, }) + + # ClearML + if args.clearml: + from clearml import Logger + + logger = Logger.current_logger() + logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch) + logger.report_scalar(title="Exact Match", series="exact_match", value=exact_match, iteration=epoch) + logger.report_scalar(title="Partial Match", series="partial_match", value=exact_match, iteration=epoch) + if args.early_stop and early_stopper.early_stop(val_loss): print("Training halted early due to reaching patience limit.") break @@ -482,6 +516,7 @@ def parse_args(): "--show-samples", dest="show_samples", action="store_true", help="Display unormalized training samples" ) parser.add_argument("--wb", dest="wb", action="store_true", help="Log to Weights & Biases") + parser.add_argument("--clearml", dest="clearml", action="store_true", help="Log to ClearML") parser.add_argument("--push-to-hub", dest="push_to_hub", action="store_true", help="Push to Huggingface Hub") parser.add_argument( "--pretrained", diff --git a/references/recognition/train_pytorch_ddp.py b/references/recognition/train_pytorch_ddp.py index 19e9b1a1c..3d1b95b56 100644 --- a/references/recognition/train_pytorch_ddp.py +++ b/references/recognition/train_pytorch_ddp.py @@ -42,11 +42,16 @@ from utils import EarlyStopper, plot_samples -def fit_one_epoch(model, device, train_loader, batch_transforms, optimizer, scheduler, amp=False): +def fit_one_epoch(model, device, train_loader, batch_transforms, optimizer, scheduler, amp=False, clearml_log=False): if amp: scaler = torch.cuda.amp.GradScaler() model.train() + if clearml_log: + from clearml import Logger + + logger = Logger.current_logger() + # Iterate over the batches of the dataset pbar = tqdm(train_loader, position=1) for images, targets in pbar: @@ -75,6 +80,12 @@ def fit_one_epoch(model, device, 
train_loader, batch_transforms, optimizer, sche scheduler.step() pbar.set_description(f"Training loss: {train_loss.item():.6}") + if clearml_log: + global iteration + logger.report_scalar( + title="Training Loss", series="train_loss", value=train_loss.item(), iteration=iteration + ) + iteration += 1 @torch.no_grad() @@ -304,35 +315,50 @@ def main(rank: int, world_size: int, args): current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") exp_name = f"{args.arch}_{current_time}" if args.name is None else args.name + if rank == 0: + config = { + "learning_rate": args.lr, + "epochs": args.epochs, + "weight_decay": args.weight_decay, + "batch_size": args.batch_size, + "architecture": args.arch, + "input_size": args.input_size, + "optimizer": args.optim, + "framework": "pytorch", + "scheduler": args.sched, + "train_hash": train_hash, + "val_hash": val_hash, + "pretrained": args.pretrained, + "rotation": args.rotation, + "amp": args.amp, + } + # W&B if rank == 0 and args.wb: run = wandb.init( name=exp_name, project="text-recognition", - config={ - "learning_rate": args.lr, - "epochs": args.epochs, - "weight_decay": args.weight_decay, - "batch_size": args.batch_size, - "architecture": args.arch, - "input_size": args.input_size, - "optimizer": args.optim, - "framework": "pytorch", - "scheduler": args.sched, - "vocab": args.vocab, - "train_hash": train_hash, - "val_hash": val_hash, - "pretrained": args.pretrained, - }, + config=config, ) + # ClearML + if rank == 0 and args.clearml: + from clearml import Task + + task = Task.init(project_name="docTR/text-recognition", task_name=exp_name, reuse_last_task_id=False) + task.upload_artifact("config", config) + global iteration + iteration = 0 + # Create loss queue min_loss = np.inf if args.early_stop: early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta) # Training loop for epoch in range(args.epochs): - fit_one_epoch(model, device, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp) + fit_one_epoch( + model, device, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, clearml_log=args.clearml + ) if rank == 0: # Validation loop at the end of each epoch @@ -357,6 +383,18 @@ def main(rank: int, world_size: int, args): "exact_match": exact_match, "partial_match": partial_match, }) + + # ClearML + if args.clearml: + from clearml import Logger + + logger = Logger.current_logger() + logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch) + logger.report_scalar(title="Exact Match", series="exact_match", value=exact_match, iteration=epoch) + logger.report_scalar( + title="Partial Match", series="partial_match", value=partial_match, iteration=epoch + ) + if args.early_stop and early_stopper.early_stop(val_loss): print("Training halted early due to reaching patience limit.") break @@ -418,6 +456,7 @@ def parse_args(): "--show-samples", dest="show_samples", action="store_true", help="Display unormalized training samples" ) parser.add_argument("--wb", dest="wb", action="store_true", help="Log to Weights & Biases") + parser.add_argument("--clearml", dest="clearml", action="store_true", help="Log to ClearML") parser.add_argument("--push-to-hub", dest="push_to_hub", action="store_true", help="Push to Huggingface Hub") parser.add_argument( "--pretrained", diff --git a/references/recognition/train_tensorflow.py b/references/recognition/train_tensorflow.py index b843b832c..d9748a74f 100644 --- a/references/recognition/train_tensorflow.py +++ 
b/references/recognition/train_tensorflow.py @@ -96,8 +96,13 @@ def apply_grads(optimizer, grads, model): optimizer.apply_gradients(zip(grads, model.trainable_weights)) -def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False): +def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False, clearml_log=False): train_iter = iter(train_loader) + if clearml_log: + from clearml import Logger + + logger = Logger.current_logger() + # Iterate over the batches of the dataset pbar = tqdm(train_iter, position=1) for images, targets in pbar: @@ -111,6 +116,12 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False): apply_grads(optimizer, grads, model) pbar.set_description(f"Training loss: {train_loss.numpy().mean():.6}") + if clearml_log: + global iteration + logger.report_scalar( + title="Training Loss", series="train_loss", value=train_loss.numpy().mean(), iteration=iteration + ) + iteration += 1 def evaluate(model, val_loader, batch_transforms, val_metric): @@ -369,6 +380,8 @@ def main(args): task = Task.init(project_name="docTR/text-recognition", task_name=exp_name, reuse_last_task_id=False) task.upload_artifact("config", config) + global iteration + iteration = 0 # Backbone freezing if args.freeze_backbone: @@ -380,7 +393,7 @@ def main(args): early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta) # Training loop for epoch in range(args.epochs): - fit_one_epoch(model, train_loader, batch_transforms, optimizer, args.amp) + fit_one_epoch(model, train_loader, batch_transforms, optimizer, args.amp, args.clearml) # Validation loop at the end of each epoch val_loss, exact_match, partial_match = evaluate(model, val_loader, batch_transforms, val_metric) @@ -408,6 +421,7 @@ def main(args): logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch) logger.report_scalar(title="Exact Match", series="exact_match", value=exact_match, iteration=epoch) logger.report_scalar(title="Partial Match", series="partial_match", value=partial_match, iteration=epoch) + if args.early_stop and early_stopper.early_stop(val_loss): print("Training halted early due to reaching patience limit.") break
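
For readers unfamiliar with the ClearML API, the snippet below is a minimal, self-contained sketch of the logging pattern these scripts now share: one Task per run, a config dict attached as an artifact, per-batch scalars reported against a manually tracked iteration counter, and per-epoch scalars reported against the epoch index. Project, task, and metric names here are illustrative only (they are not taken from the diff), and the example assumes the clearml package is installed and credentials are configured (e.g. via clearml-init).

# Hypothetical standalone example of the ClearML calls used in the patch; names are placeholders.
from clearml import Logger, Task

# One Task per training run; reuse_last_task_id=False forces a fresh task each time.
task = Task.init(project_name="docTR/example-project", task_name="example_run", reuse_last_task_id=False)
# Arbitrary Python objects (here a config dict) can be attached to the task as artifacts.
task.upload_artifact("config", {"learning_rate": 1e-3, "epochs": 2, "batch_size": 16})

logger = Logger.current_logger()
iteration = 0  # manually tracked step counter, mirroring the global `iteration` used in the scripts
for epoch in range(2):
    for step in range(5):
        fake_train_loss = 1.0 / (iteration + 1)  # placeholder standing in for the real batch loss
        logger.report_scalar(title="Training Loss", series="train_loss", value=fake_train_loss, iteration=iteration)
        iteration += 1
    # Validation metrics are reported once per epoch, against the epoch index instead.
    logger.report_scalar(title="Validation Loss", series="val_loss", value=0.5 / (epoch + 1), iteration=epoch)
task.close()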