From e7313d916d783744012ac7bb3011469d72803d25 Mon Sep 17 00:00:00 2001 From: Jordan Gong Date: Wed, 10 Aug 2022 19:54:37 +0800 Subject: Fix epoch scheduler problem --- supervised/baseline.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'supervised') diff --git a/supervised/baseline.py b/supervised/baseline.py index db93304..6072c10 100644 --- a/supervised/baseline.py +++ b/supervised/baseline.py @@ -231,15 +231,18 @@ class SupBaselineTrainer(Trainer): batch, num_batches, global_batch, iter_, num_iters, optim.param_groups[0]['lr'], train_loss.item() )) - metrics = torch.Tensor(list(self.eval(loss_fn, device))).mean(0) - eval_loss = metrics[0].item() - eval_accuracy = metrics[1].item() - epoch_log = self.EpochLogRecord(iter_, num_iters, eval_loss, eval_accuracy) - self.log(logger, epoch_log) - self.save_checkpoint(epoch_log) - # Step after save checkpoint, otherwise the schedular will one iter ahead after restore - if sched is not None: - sched.step() + if batch == loader_size - 1: + metrics = torch.Tensor(list(self.eval(loss_fn, device))).mean(0) + eval_loss = metrics[0].item() + eval_accuracy = metrics[1].item() + epoch_log = self.EpochLogRecord(iter_, num_iters, + eval_loss, eval_accuracy) + self.log(logger, epoch_log) + self.save_checkpoint(epoch_log) + # Step after save checkpoint, otherwise the schedular will + # one iter ahead after restore + if sched is not None: + sched.step() def eval(self, loss_fn: Callable, device: torch.device): model = self.models['model'] -- cgit v1.2.3