diff options
| field | value | date |
|---|---|---|
| author | Jordan Gong <jordan.gong@protonmail.com> | 2021-02-14 20:50:34 +0800 |
| committer | Jordan Gong <jordan.gong@protonmail.com> | 2021-02-14 20:50:34 +0800 |
| commit | 10944fda51563b66cf441747f7a1b292236096cf (patch) | |
| tree | 88c7ac4d1e23d39b6f19c79b701c628a3e304361 | |
| parent | 156fb6d957efda8b897c172d70eccc0d2016b2bf (diff) | |
| parent | 34d2f9017e77a7bdef761ab3d92cd0340c5154c3 (diff) | |
Merge branch 'python3.8' into python3.7

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | eval.py | 2 |
| -rw-r--r-- | models/auto_encoder.py | 70 |
| -rw-r--r-- | models/model.py | 23 |
| -rw-r--r-- | models/rgb_part_net.py | 115 |
| -rw-r--r-- | utils/triplet_loss.py | 2 |

5 files changed, 121 insertions, 91 deletions
@@ -22,7 +22,7 @@ for n in range(rank): print(f'===Rank-{n + 1} Accuracy===') for (condition, accuracy_c) in accuracy.items(): acc_excl_identical_view = accuracy_c[:, :, n].fill_diagonal_(0) - num_gallery_views = (acc_excl_identical_view != 0).sum() + num_gallery_views = (acc_excl_identical_view != 0).sum(0) acc_each_angle = acc_excl_identical_view.sum(0) / num_gallery_views print('{0}: {1} mean: {2:5.2f}'.format( condition, acc_each_angle.cpu().numpy() * 100, diff --git a/models/auto_encoder.py b/models/auto_encoder.py index 7b9b29f..918a95c 100644 --- a/models/auto_encoder.py +++ b/models/auto_encoder.py @@ -119,32 +119,47 @@ class AutoEncoder(nn.Module): embedding_dims: Tuple[int, int, int] = (128, 128, 64) ): super().__init__() + self.f_c_c1_t2_ = None + self.f_p_c1_t2_ = None + self.f_c_c1_t1_ = None self.encoder = Encoder(channels, feature_channels, embedding_dims) self.decoder = Decoder(embedding_dims, feature_channels, channels) - def forward(self, x_c1_t2, x_c1_t1=None, x_c2_t2=None): - n, t, c, h, w = x_c1_t2.size() - # x_c1_t2 is the frame for later module - x_c1_t2_ = x_c1_t2.view(n * t, c, h, w) - (f_a_c1_t2_, f_c_c1_t2_, f_p_c1_t2_) = self.encoder(x_c1_t2_) - - if self.training: - # t1 is random time step, c2 is another condition - x_c1_t1 = x_c1_t1.view(n * t, c, h, w) - (f_a_c1_t1_, f_c_c1_t1_, _) = self.encoder(x_c1_t1) - x_c2_t2 = x_c2_t2.view(n * t, c, h, w) - (_, f_c_c2_t2_, f_p_c2_t2_) = self.encoder(x_c2_t2) - - x_c1_t2_pred_ = self.decoder(f_a_c1_t1_, f_c_c1_t1_, f_p_c1_t2_) - x_c1_t2_pred = x_c1_t2_pred_.view(n, t, c, h, w) - - xrecon_loss = torch.stack([ - F.mse_loss(x_c1_t2[:, i, :, :, :], x_c1_t2_pred[:, i, :, :, :]) - for i in range(t) - ]).sum() - - f_c_c1_t1 = f_c_c1_t1_.view(n, t, -1) - f_c_c1_t2 = f_c_c1_t2_.view(n, t, -1) + def forward(self, x_t2, is_c1=True): + n, t, c, h, w = x_t2.size() + if is_c1: # condition 1 + # x_c1_t2 is the frame for later module + x_c1_t2_ = x_t2.view(n * t, c, h, w) + (f_a_c1_t2_, self.f_c_c1_t2_, 
self.f_p_c1_t2_) \ + = self.encoder(x_c1_t2_) + + if self.training: + # t1 is random time step + x_c1_t1 = x_t2[:, torch.randperm(t), :, :, :] + x_c1_t1_ = x_c1_t1.view(n * t, c, h, w) + (f_a_c1_t1_, self.f_c_c1_t1_, _) = self.encoder(x_c1_t1_) + + x_c1_t2_pred_ = self.decoder( + f_a_c1_t1_, self.f_c_c1_t1_, self.f_p_c1_t2_ + ) + x_c1_t2_pred = x_c1_t2_pred_.view(n, t, c, h, w) + + xrecon_loss = torch.stack([ + F.mse_loss(x_t2[:, i, :, :, :], x_c1_t2_pred[:, i, :, :, :]) + for i in range(t) + ]).sum() + + return ((f_a_c1_t2_, self.f_c_c1_t2_, self.f_p_c1_t2_), + xrecon_loss) + else: # evaluating + return self.f_c_c1_t2_, self.f_p_c1_t2_ + else: # condition 2 + # c2 is another condition + x_c2_t2_ = x_t2.view(n * t, c, h, w) + (_, f_c_c2_t2_, f_p_c2_t2_) = self.encoder(x_c2_t2_) + + f_c_c1_t1 = self.f_c_c1_t1_.view(n, t, -1) + f_c_c1_t2 = self.f_c_c1_t2_.view(n, t, -1) f_c_c2_t2 = f_c_c2_t2_.view(n, t, -1) cano_cons_loss = torch.stack([ F.mse_loss(f_c_c1_t1[:, i, :], f_c_c1_t2[:, i, :]) @@ -152,13 +167,8 @@ class AutoEncoder(nn.Module): for i in range(t) ]).mean() - f_p_c1_t2 = f_p_c1_t2_.view(n, t, -1) + f_p_c1_t2 = self.f_p_c1_t2_.view(n, t, -1) f_p_c2_t2 = f_p_c2_t2_.view(n, t, -1) pose_sim_loss = F.mse_loss(f_p_c1_t2.mean(1), f_p_c2_t2.mean(1)) - return ( - (f_a_c1_t2_, f_c_c1_t2_, f_p_c1_t2_), - (xrecon_loss, cano_cons_loss, pose_sim_loss * 10) - ) - else: # evaluating - return f_c_c1_t2_, f_p_c1_t2_ + return cano_cons_loss, pose_sim_loss * 10 diff --git a/models/model.py b/models/model.py index eae15e3..199e371 100644 --- a/models/model.py +++ b/models/model.py @@ -179,7 +179,7 @@ class Model: # Training start start_time = datetime.now() running_loss = torch.zeros(5, device=self.device) - print(f"{'Time':^8} {'Iter':^5} {'Loss':^6}", + print(f"{'Time':^8} {'Iter':^5} {'Loss':^5}", f"{'Xrecon':^8} {'CanoCons':^8} {'PoseSim':^8}", f"{'BATripH':^8} {'BATripP':^8} {'LRs':^19}") for (batch_c1, batch_c2) in dataloader: @@ -187,10 +187,21 @@ class Model: # Zero the 
parameter gradients self.optimizer.zero_grad() # forward + backward + optimize + # Feed data twice in order to reduce memory usage x_c1 = batch_c1['clip'].to(self.device) - x_c2 = batch_c2['clip'].to(self.device) y = batch_c1['label'].to(self.device) - losses, images = self.rgb_pn(x_c1, x_c2, y) + # Duplicate labels for each part + y = y.unsqueeze(1).repeat(1, self.rgb_pn.num_total_parts) + # Feed condition 1 clips first + losses, images = self.rgb_pn(x_c1, y) + (xrecon_loss, hpm_ba_trip, pn_ba_trip) = losses + x_c2 = batch_c2['clip'].to(self.device) + # Then feed condition 2 clips + cano_cons_loss, pose_sim_loss = self.rgb_pn(x_c2, is_c1=False) + losses = torch.stack(( + xrecon_loss, cano_cons_loss, pose_sim_loss, + hpm_ba_trip, pn_ba_trip + )) loss = losses.sum() loss.backward() self.optimizer.step() @@ -220,7 +231,9 @@ class Model: self.writer.add_images( 'Canonical image', i_c, self.curr_iter ) - for (i, (o, a, p)) in enumerate(zip(x_c1, i_a, i_p)): + for (i, (o, a, p)) in enumerate(zip( + batch_c1['clip'], i_a, i_p + )): self.writer.add_images( f'Original image/batch {i}', o, self.curr_iter ) @@ -234,7 +247,7 @@ class Model: remaining_minute, second = divmod(time_used.seconds, 60) hour, minute = divmod(remaining_minute, 60) print(f'{hour:02}:{minute:02}:{second:02}', - f'{self.curr_iter:5d} {running_loss.sum() / 100:6.3f}', + f'{self.curr_iter:5d} {running_loss.sum() / 100:5.3f}', '{:f} {:f} {:f} {:f} {:f}'.format(*running_loss / 100), '{:.3e} {:.3e}'.format(lrs[0], lrs[1])) running_loss.zero_() diff --git a/models/rgb_part_net.py b/models/rgb_part_net.py index 841de96..c489ec6 100644 --- a/models/rgb_part_net.py +++ b/models/rgb_part_net.py @@ -46,7 +46,8 @@ class RGBPartNet(nn.Module): ae_feature_channels * 2, out_channels, hpm_use_1x1conv, hpm_scales, hpm_use_avg_pool, hpm_use_max_pool ) - empty_fc = torch.empty(self.hpm_num_parts + tfa_num_parts, + self.num_total_parts = self.hpm_num_parts + tfa_num_parts + empty_fc = torch.empty(self.num_total_parts, 
out_channels, embedding_dims) self.fc_mat = nn.Parameter(empty_fc) @@ -57,59 +58,67 @@ class RGBPartNet(nn.Module): def fc(self, x): return x @ self.fc_mat - def forward(self, x_c1, x_c2=None, y=None): - # Step 1: Disentanglement - # n, t, c, h, w - ((x_c, x_p), losses, images) = self._disentangle(x_c1, x_c2) - - # Step 2.a: Static Gait Feature Aggregation & HPM - # n, c, h, w - x_c = self.hpm(x_c) - # p, n, c - - # Step 2.b: FPFE & TFA (Dynamic Gait Feature Aggregation) - # n, t, c, h, w - x_p = self.pn(x_p) - # p, n, c - - # Step 3: Cat feature map together and fc - x = torch.cat((x_c, x_p)) - x = self.fc(x) - - if self.training: - hpm_ba_trip = self.hpm_ba_trip(x[:self.hpm_num_parts], y) - pn_ba_trip = self.pn_ba_trip(x[self.hpm_num_parts:], y) - losses = torch.stack((*losses, hpm_ba_trip, pn_ba_trip)) - return losses, images - else: - return x.unsqueeze(1).view(-1) - - def _disentangle(self, x_c1_t2, x_c2_t2=None): - n, t, c, h, w = x_c1_t2.size() - device = x_c1_t2.device - x_c1_t1 = x_c1_t2[:, torch.randperm(t), :, :, :] - if self.training: - ((f_a_, f_c_, f_p_), losses) = self.ae(x_c1_t2, x_c1_t1, x_c2_t2) - # Decode features - with torch.no_grad(): + def forward(self, x, y=None, is_c1=True): + # Step 1a: Disentangle condition 1 clips + if is_c1: + # n, t, c, h, w + ((x_c, x_p), xrecon_loss, images) = self._disentangle(x, is_c1) + + # Step 2.a: Static Gait Feature Aggregation & HPM + # n, c, h, w + x_c = self.hpm(x_c) + # p, n, c + + # Step 2.b: FPFE & TFA (Dynamic Gait Feature Aggregation) + # n, t, c, h, w + x_p = self.pn(x_p) + # p, n, c + + # Step 3: Cat feature map together and fc + x = torch.cat((x_c, x_p)) + x = self.fc(x) + + if self.training: + y = y.T + hpm_ba_trip = self.hpm_ba_trip( + x[:self.hpm_num_parts], y[:self.hpm_num_parts] + ) + pn_ba_trip = self.pn_ba_trip( + x[self.hpm_num_parts:], y[self.hpm_num_parts:] + ) + return (xrecon_loss, hpm_ba_trip, pn_ba_trip), images + else: # evaluating + return x.unsqueeze(1).view(-1) + else: # Step 1b: 
Disentangle condition 2 clips + return self._disentangle(x, is_c1) + + def _disentangle(self, x_t2, is_c1=True): + if is_c1: # condition 1 + n, t, *_ = x_size = x_t2.size() + device = x_t2.device + if self.training: + (f_a_, f_c_, f_p_), xrecon_loss = self.ae(x_t2, is_c1) + # Decode features + with torch.no_grad(): + x_c = self._decode_cano_feature(f_c_, n, t, device) + x_p = self._decode_pose_feature(f_p_, *x_size, device) + + i_a, i_c, i_p = None, None, None + if self.image_log_on: + i_a = self._decode_appr_feature(f_a_, *x_size, device) + # Continue decoding canonical features + i_c = self.ae.decoder.trans_conv3(x_c) + i_c = torch.sigmoid(self.ae.decoder.trans_conv4(i_c)) + i_p = x_p + + return (x_c, x_p), xrecon_loss, (i_a, i_c, i_p) + else: # evaluating + f_c_, f_p_ = self.ae(x_t2) x_c = self._decode_cano_feature(f_c_, n, t, device) - x_p = self._decode_pose_feature(f_p_, n, t, c, h, w, device) - - i_a, i_c, i_p = None, None, None - if self.image_log_on: - i_a = self._decode_appr_feature(f_a_, n, t, c, h, w, device) - # Continue decoding canonical features - i_c = self.ae.decoder.trans_conv3(x_c) - i_c = torch.sigmoid(self.ae.decoder.trans_conv4(i_c)) - i_p = x_p - - return (x_c, x_p), losses, (i_a, i_c, i_p) - - else: # evaluating - f_c_, f_p_ = self.ae(x_c1_t2) - x_c = self._decode_cano_feature(f_c_, n, t, device) - x_p = self._decode_pose_feature(f_p_, n, t, c, h, w, device) - return (x_c, x_p), None, None + x_p = self._decode_pose_feature(f_p_, *x_size, device) + return (x_c, x_p), None, None + else: # condition 2 + return self.ae(x_t2, is_c1) def _decode_appr_feature(self, f_a_, n, t, c, h, w, device): # Decode appearance features diff --git a/utils/triplet_loss.py b/utils/triplet_loss.py index d573ef4..954def2 100644 --- a/utils/triplet_loss.py +++ b/utils/triplet_loss.py @@ -9,9 +9,7 @@ class BatchAllTripletLoss(nn.Module): self.margin = margin def forward(self, x, y): - # Duplicate labels for each part p, n, c = x.size() - y = y.repeat(p, 1) # 
Euclidean distance p x n x n x_squared_sum = torch.sum(x ** 2, dim=2) |