author     Jordan Gong <jordan.gong@protonmail.com>   2021-05-17 17:19:20 +0800
committer  Jordan Gong <jordan.gong@protonmail.com>   2021-05-17 17:19:20 +0800
commit     2b5bc2350df8ebb8e4e5722f5563f2e44823ac34 (patch)
tree       94816e528ed7ac1732c44877181540fd5a8ad810
parent     fde8d8fc0e4d986be37b26b468b0ac9d53d7bcd9 (diff)
Model modification (HEAD, master)
Reduce channels in the auto-encoder and add more layers
-rw-r--r--  config.py                 8
-rw-r--r--  models/auto_encoder.py   86
-rw-r--r--  models/model.py           2
3 files changed, 61 insertions(+), 35 deletions(-)
diff --git a/config.py b/config.py
index 726d06e..4f6d248 100644
--- a/config.py
+++ b/config.py
@@ -51,20 +51,20 @@ config: Configuration = {
'hyperparameter': {
'model': {
# Auto-encoder feature channels coefficient
- 'ae_feature_channels': 64,
+ 'ae_feature_channels': 32,
# Appearance, canonical and pose feature dimensions
- 'f_a_c_p_dims': (192, 192, 128),
+ 'f_a_c_p_dims': (48, 48, 32),
# HPM pyramid scales, of which sum is number of parts
'hpm_scales': (1, 2, 4, 8),
# Global pooling method
'hpm_use_avg_pool': True,
'hpm_use_max_pool': True,
# Number of parts after Part Net
- 'tfa_num_parts': 16,
+ 'tfa_num_parts': 8,
# Attention squeeze ratio
'tfa_squeeze_ratio': 4,
# Embedding dimensions for each part
- 'embedding_dims': (256, 256),
+ 'embedding_dims': (32, 64),
# Batch Hard or Batch All
'triplet_is_hard': True,
# Use non-zero mean or sum
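Note on the new values above: as the encoder changes below show, the last convolution now emits ae_feature_channels * 4 channels, which are then split into the appearance, canonical and pose features, so f_a_c_p_dims has to sum to exactly that number (the old values obeyed the same rule with an *8 multiplier). A minimal sanity-check sketch in Python (the names mirror the config keys; nothing here is part of the repository):

# Sketch: the split in Encoder.forward only works if the three feature
# dimensions add up to the channel count of the last encoder conv layer.
ae_feature_channels = 32
f_a_c_p_dims = (48, 48, 32)
assert sum(f_a_c_p_dims) == ae_feature_channels * 4    # 128 == 128
# Old setting: 192 + 192 + 128 == 64 * 8 == 512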
diff --git a/models/auto_encoder.py b/models/auto_encoder.py
index dc7843a..aa55a42 100644
--- a/models/auto_encoder.py
+++ b/models/auto_encoder.py
@@ -12,42 +12,53 @@ class Encoder(nn.Module):
self,
in_channels: int = 3,
frame_size: tuple[int, int] = (64, 48),
- feature_channels: int = 64,
- output_dims: tuple[int, int, int] = (192, 192, 128)
+ feature_channels: int = 32,
+ output_dims: tuple[int, int, int] = (48, 48, 32)
):
super().__init__()
h_0, w_0 = frame_size
h_1, w_1 = h_0 // 2, w_0 // 2
- h_2, w_2 = h_1 // 2, w_1 // 2
+ h_2, w_2 = h_1 // 4, w_1 // 4
# Appearance features, canonical features, pose features
(self.f_a_dim, self.f_c_dim, self.f_p_dim) = output_dims
# Conv1 in_channels x H x W
# -> feature_map_size x H x W
- self.conv1 = VGGConv2d(in_channels, feature_channels)
+ self.conv1 = VGGConv2d(in_channels,
+ feature_channels,
+ kernel_size=5,
+ padding=2)
+ # Conv2 feature_map_size x H x W
+ # -> feature_map_size x H x W
+ self.conv2 = VGGConv2d(feature_channels, feature_channels)
# MaxPool1 feature_map_size x H x W
# -> feature_map_size x H//2 x W//2
self.max_pool1 = nn.AdaptiveMaxPool2d((h_1, w_1))
- # Conv2 feature_map_size x H//2 x W//2
- # -> feature_map_size*4 x H//2 x W//2
- self.conv2 = VGGConv2d(feature_channels, feature_channels * 4)
- # MaxPool2 feature_map_size*4 x H//2 x W//2
- # -> feature_map_size*4 x H//4 x W//4
+ # Conv3 feature_map_size x H//2 x W//2
+ # -> feature_map_size*2 x H//2 x W//2
+ self.conv3 = VGGConv2d(feature_channels, feature_channels * 2)
+ # Conv4 feature_map_size*2 x H//2 x W//2
+ # -> feature_map_size*2 x H//2 x W//2
+ self.conv4 = VGGConv2d(feature_channels * 2, feature_channels * 2)
+ # MaxPool2 feature_map_size*2 x H//2 x W//2
+ # -> feature_map_size*2 x H//8 x W//8
self.max_pool2 = nn.AdaptiveMaxPool2d((h_2, w_2))
- # Conv3 feature_map_size*4 x H//4 x W//4
- # -> feature_map_size*8 x H//4 x W//4
- self.conv3 = VGGConv2d(feature_channels * 4, feature_channels * 8)
- # Conv4 feature_map_size*8 x H//4 x W//4
- # -> feature_map_size*8 x H//4 x W//4 (for large dataset)
- self.conv4 = VGGConv2d(feature_channels * 8, feature_channels * 8)
+ # Conv5 feature_map_size*2 x H//8 x W//8
+ # -> feature_map_size*4 x H//8 x W//8
+ self.conv5 = VGGConv2d(feature_channels * 2, feature_channels * 4)
+ # Conv6 feature_map_size*4 x H//8 x W//8
+ # -> feature_map_size*4 x H//8 x W//8
+ self.conv6 = VGGConv2d(feature_channels * 4, feature_channels * 4)
def forward(self, x):
x = self.conv1(x)
- x = self.max_pool1(x)
x = self.conv2(x)
- x = self.max_pool2(x)
+ x = self.max_pool1(x)
x = self.conv3(x)
x = self.conv4(x)
+ x = self.max_pool2(x)
+ x = self.conv5(x)
+ x = self.conv6(x)
f_appearance, f_canonical, f_pose = x.split(
(self.f_a_dim, self.f_c_dim, self.f_p_dim), dim=1
)
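For a concrete picture of the new encoder, here is a quick shape check with the default frame size of (64, 48); this is a sketch that assumes the module can be imported as in the diff and that VGGConv2d keeps the spatial size, as the comments above state:

import torch
from models.auto_encoder import Encoder

encoder = Encoder(in_channels=3, frame_size=(64, 48),
                  feature_channels=32, output_dims=(48, 48, 32))
x = torch.rand(2, 3, 64, 48)                  # N x C x H x W
f_a, f_c, f_p = encoder(x)
# The two pooling stages shrink 64 x 48 down to H//8 x W//8 = 8 x 6,
# and the 128 output channels are split into 48 / 48 / 32.
assert f_a.shape == (2, 48, 8, 6)
assert f_c.shape == (2, 48, 8, 6)
assert f_p.shape == (2, 32, 8, 6)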
@@ -59,41 +70,56 @@ class Decoder(nn.Module):
def __init__(
self,
- feature_channels: int = 64,
+ feature_channels: int = 32,
out_channels: int = 3,
):
super().__init__()
self.feature_channels = feature_channels
- # TransConv1 feature_map_size*8 x H x W
+ # TransConv1 feature_map_size*4 x H x W
# -> feature_map_size*4 x H x W
- self.trans_conv1 = DCGANConvTranspose2d(feature_channels * 8,
+ self.trans_conv1 = DCGANConvTranspose2d(feature_channels * 4,
feature_channels * 4,
kernel_size=3,
stride=1,
padding=1)
- # TransConv2 feature_map_size*4 x H x W
+ # TransConv2 feature_map_size*4 x H x W
# -> feature_map_size*2 x H*2 x W*2
self.trans_conv2 = DCGANConvTranspose2d(feature_channels * 4,
feature_channels * 2)
# TransConv3 feature_map_size*2 x H*2 x W*2
- # -> feature_map_size x H*2 x W*2
+ # -> feature_map_size*2 x H*2 x W*2
self.trans_conv3 = DCGANConvTranspose2d(feature_channels * 2,
+ feature_channels * 2,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ # TransConv4 feature_map_size*2 x H*2 x W*2
+ # -> feature_map_size x H*4 x W*4
+ self.trans_conv4 = DCGANConvTranspose2d(feature_channels * 2,
+ feature_channels)
+
+ # TransConv5 feature_map_size x H*4 x W*4
+ # -> feature_map_size x H*4 x W*4
+ self.trans_conv5 = DCGANConvTranspose2d(feature_channels,
feature_channels,
kernel_size=3,
stride=1,
padding=1)
- # TransConv4 feature_map_size x H*2 x W*2
- # -> in_channels x H*4 x W*4
- self.trans_conv4 = DCGANConvTranspose2d(feature_channels, out_channels,
- is_last_layer=True)
+
+ # TransConv6 feature_map_size x H*4 x W*4
+ # -> out_channels x H*8 x W*8
+ self.trans_conv6 = DCGANConvTranspose2d(feature_channels,
+ out_channels)
def forward(self, f_appearance, f_canonical, f_pose):
x = torch.cat((f_appearance, f_canonical, f_pose), dim=1)
x = self.trans_conv1(x)
x = self.trans_conv2(x)
x = self.trans_conv3(x)
- x = torch.sigmoid(self.trans_conv4(x))
+ x = self.trans_conv4(x)
+ x = self.trans_conv5(x)
+ x = torch.sigmoid(self.trans_conv6(x))
return x
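The decoder mirrors the encoder: three of the six transposed convolutions keep the spatial size (stride 1) and the other three upsample by 2, so an 8 x 6 input map is brought back to 64 x 48. A matching sketch, assuming the stride-2 DCGANConvTranspose2d layers double the spatial size as the comments indicate:

import torch
from models.auto_encoder import Decoder

decoder = Decoder(feature_channels=32, out_channels=3)
f_a = torch.rand(2, 48, 8, 6)
f_c = torch.rand(2, 48, 8, 6)
f_p = torch.rand(2, 32, 8, 6)
x_recon = decoder(f_a, f_c, f_p)              # features are concatenated inside
assert x_recon.shape == (2, 3, 64, 48)        # sigmoid keeps values in [0, 1]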
@@ -103,8 +129,8 @@ class AutoEncoder(nn.Module):
self,
channels: int = 3,
frame_size: tuple[int, int] = (64, 48),
- feature_channels: int = 64,
- embedding_dims: tuple[int, int, int] = (192, 192, 128)
+ feature_channels: int = 32,
+ embedding_dims: tuple[int, int, int] = (48, 48, 32)
):
super().__init__()
self.embedding_dims = embedding_dims
@@ -118,7 +144,7 @@ class AutoEncoder(nn.Module):
x_c1_t2_ = x_c1_t2.view(n * t, c, h, w)
(f_a_c1_t2_, f_c_c1_t2_, f_p_c1_t2_) = self.encoder(x_c1_t2_)
- f_size = [torch.Size([n, t, embedding_dim, h // 4, w // 4])
+ f_size = [torch.Size([n, t, embedding_dim, h // 8, w // 8])
for embedding_dim in self.embedding_dims]
f_a_c1_t2 = f_a_c1_t2_.view(f_size[0])
f_c_c1_t2 = f_c_c1_t2_.view(f_size[1])
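Because the encoder now downsamples by a factor of 8 instead of 4, the per-frame features have to be regrouped with the matching h // 8, w // 8 spatial size, hence the f_size change above. An illustrative reshape (the concrete numbers and names below are made up):

import torch

n, t, h, w = 4, 30, 64, 48                          # clips x frames x frame size
f_a_flat = torch.rand(n * t, 48, h // 8, w // 8)    # per-frame encoder output
f_a = f_a_flat.view(n, t, 48, h // 8, w // 8)       # regrouped into clips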
diff --git a/models/model.py b/models/model.py
index 6118bdf..bf23428 100644
--- a/models/model.py
+++ b/models/model.py
@@ -366,7 +366,7 @@ class Model:
for _f_p_c1_t2, _f_p_c2_t2 in zip(*f_loss[2])
]).sum()
- return xrecon_loss, cano_cons_loss * 10, pose_sim_loss * 100
+ return xrecon_loss / 10, cano_cons_loss, pose_sim_loss * 10
def _classification_loss(self, embedding, y):
# Duplicate labels for each part
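The loss re-weighting only rescales the three terms by a common factor of 10 (from weights 1 / 10 / 100 to 0.1 / 1 / 10), so their relative balance is unchanged while the overall magnitude shrinks. A tiny illustration with made-up loss values:

# Made-up values, only to show that the ratios are preserved.
xrecon_loss, cano_cons_loss, pose_sim_loss = 2.0, 0.3, 0.05
old_total = xrecon_loss + cano_cons_loss * 10 + pose_sim_loss * 100   # 10.0
new_total = xrecon_loss / 10 + cano_cons_loss + pose_sim_loss * 10    #  1.0
assert abs(old_total - new_total * 10) < 1e-9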