diff options
-rw-r--r-- | config.py | 6 | ||||
-rw-r--r-- | models/auto_encoder.py | 15 | ||||
-rw-r--r-- | models/model.py | 49 | ||||
-rw-r--r-- | models/rgb_part_net.py | 8 |
4 files changed, 44 insertions, 34 deletions
@@ -5,7 +5,7 @@ config: Configuration = { # Disable accelerator 'disable_acc': False, # GPU(s) used in training or testing if available - 'CUDA_VISIBLE_DEVICES': '0', + 'CUDA_VISIBLE_DEVICES': '0,1', # Directory used in training or testing for temporary storage 'save_dir': 'runs', # Recorde disentangled image or not @@ -30,14 +30,14 @@ config: Configuration = { # Resolution after resize, can be divided 16 'frame_size': (64, 48), # Cache dataset or not - 'cache_on': False, + 'cache_on': True, }, # Dataloader settings 'dataloader': { # Batch size (pr, k) # `pr` denotes number of persons # `k` denotes number of sequences per person - 'batch_size': (4, 6), + 'batch_size': (6, 8), # Number of workers of Dataloader 'num_workers': 4, # Faster data transfer from RAM to GPU if enabled diff --git a/models/auto_encoder.py b/models/auto_encoder.py index 4fece69..0694ff1 100644 --- a/models/auto_encoder.py +++ b/models/auto_encoder.py @@ -151,27 +151,18 @@ class AutoEncoder(nn.Module): x_c1_t2_pred_ = self.decoder(f_a_c1_t1_, f_c_c1_t1_, f_p_c1_t2_) x_c1_t2_pred = x_c1_t2_pred_.view(n, t, c, h, w) - xrecon_loss = torch.stack([ - F.mse_loss(x_c1_t2[:, i, :, :, :], x_c1_t2_pred[:, i, :, :, :]) - for i in range(t) - ]).sum() - f_c_c1_t1 = f_c_c1_t1_.view(n, t, -1) f_c_c1_t2 = f_c_c1_t2_.view(n, t, -1) f_c_c2_t2 = f_c_c2_t2_.view(n, t, -1) - cano_cons_loss = torch.stack([ - F.mse_loss(f_c_c1_t1[:, i, :], f_c_c1_t2[:, i, :]) - + F.mse_loss(f_c_c1_t2[:, i, :], f_c_c2_t2[:, i, :]) - for i in range(t) - ]).mean() f_p_c1_t2 = f_p_c1_t2_.view(n, t, -1) f_p_c2_t2 = f_p_c2_t2_.view(n, t, -1) - pose_sim_loss = F.mse_loss(f_p_c1_t2.mean(1), f_p_c2_t2.mean(1)) return ( (f_a_c1_t2_, f_c_c1_t2_, f_p_c1_t2_), - (xrecon_loss, cano_cons_loss, pose_sim_loss * 10) + (x_c1_t2_pred, + (f_c_c1_t1, f_c_c1_t2, f_c_c2_t2), + (f_p_c1_t2, f_p_c2_t2)) ) else: # evaluating return f_c_c1_t2_, f_p_c1_t2_ diff --git a/models/model.py b/models/model.py index b09d600..bf79564 100644 --- a/models/model.py +++ b/models/model.py @@ -177,18 +177,21 @@ class Model: triplet_is_hard, triplet_is_mean, None ) + num_sampled_frames = dataset_config.get('num_sampled_frames', 30) num_pairs = (self.pr*self.k-1) * (self.pr*self.k) // 2 num_pos_pairs = (self.k*(self.k-1)//2) * self.pr # Try to accelerate computation using CUDA or others + self.rgb_pn = nn.DataParallel(self.rgb_pn) self.rgb_pn = self.rgb_pn.to(self.device) + self.triplet_loss_hpm = nn.DataParallel(self.triplet_loss_hpm) self.triplet_loss_hpm = self.triplet_loss_hpm.to(self.device) + self.triplet_loss_pn = nn.DataParallel(self.triplet_loss_pn) self.triplet_loss_pn = self.triplet_loss_pn.to(self.device) - self.optimizer = optim.Adam([ - {'params': self.rgb_pn.ae.parameters(), **ae_optim_hp}, - {'params': self.rgb_pn.hpm.parameters(), **hpm_optim_hp}, - {'params': self.rgb_pn.pn.parameters(), **pn_optim_hp}, + {'params': self.rgb_pn.module.ae.parameters(), **ae_optim_hp}, + {'params': self.rgb_pn.module.hpm.parameters(), **hpm_optim_hp}, + {'params': self.rgb_pn.module.pn.parameters(), **pn_optim_hp}, ], **optim_hp) start_step = sched_hp.get('start_step', 15_000) @@ -241,20 +244,34 @@ class Model: # forward + backward + optimize x_c1 = batch_c1['clip'].to(self.device) x_c2 = batch_c2['clip'].to(self.device) - embed_c, embed_p, ae_losses, images = self.rgb_pn(x_c1, x_c2) + embed_c, embed_p, images, feature_for_loss = self.rgb_pn(x_c1, x_c2) + x_c1_pred = feature_for_loss[0] + xrecon_loss = torch.stack([ + F.mse_loss(x_c1_pred[:, i, :, :, :], x_c1[:, i, :, :, :]) + for i in range(num_sampled_frames) + ]).sum() + f_c_c1_t1, f_c_c1_t2, f_c_c2_t2 = feature_for_loss[1] + cano_cons_loss = torch.stack([ + F.mse_loss(f_c_c1_t1[:, i, :], f_c_c1_t2[:, i, :]) + + F.mse_loss(f_c_c1_t2[:, i, :], f_c_c2_t2[:, i, :]) + for i in range(num_sampled_frames) + ]).mean() + f_p_c1_t2, f_p_c2_t2 = feature_for_loss[2] + pose_sim_loss = F.mse_loss( + f_p_c1_t2.mean(1), f_p_c2_t2.mean(1) + ) * 10 y = batch_c1['label'].to(self.device) # Duplicate labels for each part - y = y.repeat(self.rgb_pn.num_parts, 1) + y = y.repeat(self.rgb_pn.module.num_parts, 1) trip_loss_hpm, hpm_dist, hpm_num_non_zero = self.triplet_loss_hpm( - embed_c, y[:self.rgb_pn.hpm.num_parts] + embed_c.transpose(0, 1), y[:self.rgb_pn.module.hpm.num_parts] ) trip_loss_pn, pn_dist, pn_num_non_zero = self.triplet_loss_pn( - embed_p, y[self.rgb_pn.hpm.num_parts:] + embed_p.transpose(0, 1), y[self.rgb_pn.module.hpm.num_parts:] ) losses = torch.stack(( - *ae_losses, - trip_loss_hpm.mean(), - trip_loss_pn.mean() + xrecon_loss, cano_cons_loss, pose_sim_loss, + trip_loss_hpm.mean(), trip_loss_pn.mean() )) loss = losses.sum() loss.backward() @@ -264,10 +281,11 @@ class Model: running_loss += losses.detach() # Write losses to TensorBoard self.writer.add_scalar('Loss/all', loss, self.curr_iter) - self.writer.add_scalars('Loss/disentanglement', dict(zip(( - 'Cross reconstruction loss', 'Canonical consistency loss', - 'Pose similarity loss' - ), ae_losses)), self.curr_iter) + self.writer.add_scalars('Loss/disentanglement', { + 'Cross reconstruction loss': xrecon_loss, + 'Canonical consistency loss': cano_cons_loss, + 'Pose similarity loss': pose_sim_loss + }, self.curr_iter) self.writer.add_scalars('Loss/triplet loss', { 'HPM': losses[3], 'PartNet': losses[4] @@ -411,6 +429,7 @@ class Model: model_hp.pop('triplet_margins', None) self.rgb_pn = RGBPartNet(self.in_channels, self.in_size, **model_hp) # Try to accelerate computation using CUDA or others + self.rgb_pn = nn.DataParallel(self.rgb_pn) self.rgb_pn = self.rgb_pn.to(self.device) self.rgb_pn.eval() diff --git a/models/rgb_part_net.py b/models/rgb_part_net.py index c38a567..fdeed17 100644 --- a/models/rgb_part_net.py +++ b/models/rgb_part_net.py @@ -42,7 +42,7 @@ class RGBPartNet(nn.Module): def forward(self, x_c1, x_c2=None): # Step 1: Disentanglement # n, t, c, h, w - ((x_c, x_p), ae_losses, images) = self._disentangle(x_c1, x_c2) + ((x_c, x_p), images, f_loss) = self._disentangle(x_c1, x_c2) # Step 2.a: Static Gait Feature Aggregation & HPM # n, c, h, w @@ -55,7 +55,7 @@ class RGBPartNet(nn.Module): # p, n, d if self.training: - return x_c, x_p, ae_losses, images + return x_c.transpose(0, 1), x_p.transpose(0, 1), images, f_loss else: return torch.cat((x_c, x_p)).unsqueeze(1).view(-1) @@ -64,7 +64,7 @@ class RGBPartNet(nn.Module): device = x_c1_t2.device if self.training: x_c1_t1 = x_c1_t2[:, torch.randperm(t), :, :, :] - ((f_a_, f_c_, f_p_), losses) = self.ae(x_c1_t2, x_c1_t1, x_c2_t2) + (f_a_, f_c_, f_p_), f_loss = self.ae(x_c1_t2, x_c1_t1, x_c2_t2) # Decode features x_c = self._decode_cano_feature(f_c_, n, t, device) x_p_ = self._decode_pose_feature(f_p_, n, t, device) @@ -81,7 +81,7 @@ class RGBPartNet(nn.Module): i_p_ = torch.sigmoid(self.ae.decoder.trans_conv4(i_p_)) i_p = i_p_.view(n, t, c, h, w) - return (x_c, x_p), losses, (i_a, i_c, i_p) + return (x_c, x_p), (i_a, i_c, i_p), f_loss else: # evaluating f_c_, f_p_ = self.ae(x_c1_t2) |