author     Jordan Gong <jordan.gong@protonmail.com>    2021-02-18 18:38:31 +0800
committer  Jordan Gong <jordan.gong@protonmail.com>    2021-02-18 18:38:31 +0800
commit     8012fce3e595aad31f4a52dc316b46e558234dff (patch)
tree       4a5e19a21ad8a4470931f5884777c127197153c0
parent     2988c1b9afd4e869b629a8629abedbf63d2452aa (diff)
parent     84a3d5991f2f7272d1be54ad6cfe6ce695f915a0 (diff)
Merge branch 'master' into data_parallel_py3.8
-rw-r--r--   config.py              |  6
-rw-r--r--   models/auto_encoder.py | 81
-rw-r--r--   models/layers.py       |  8
-rw-r--r--   models/model.py        | 14
-rw-r--r--   models/rgb_part_net.py | 17
-rw-r--r--   preprocess.py          |  2

6 files changed, 74 insertions(+), 54 deletions(-)
diff --git a/config.py b/config.py
--- a/config.py
+++ b/config.py
@@ -16,7 +16,7 @@ config: Configuration = {
         # Name of dataset (CASIA-B or FVG)
         'name': 'CASIA-B',
         # Path to dataset root (required)
-        'root_dir': 'data/CASIA-B-MRCNN/SEG',
+        'root_dir': 'data/CASIA-B-MRCNN-V2/SEG',
         # The number of subjects for training
         'train_size': 74,
         # Number of sampled frames per sequence (Training only)
@@ -27,8 +27,8 @@ config: Configuration = {
         'discard_threshold': 15,
         # Number of input channels of model
         'num_input_channels': 3,
-        # Resolution after resize, height : width should be 2 : 1
-        'frame_size': (64, 32),
+        # Resolution after resize, can be divided 16
+        'frame_size': (64, 48),
         # Cache dataset or not
         'cache_on': False,
     },
diff --git a/models/auto_encoder.py b/models/auto_encoder.py
index 7b9b29f..c6bc52f 100644
--- a/models/auto_encoder.py
+++ b/models/auto_encoder.py
@@ -13,39 +13,46 @@ class Encoder(nn.Module):
     def __init__(
             self,
             in_channels: int = 3,
+            frame_size: tuple[int, int] = (64, 48),
             feature_channels: int = 64,
             output_dims: Tuple[int, int, int] = (128, 128, 64)
     ):
         super().__init__()
         self.feature_channels = feature_channels
+        h_0, w_0 = frame_size
+        h_1, w_1 = h_0 // 2, w_0 // 2
+        h_2, w_2 = h_1 // 2, w_1 // 2
+        self.feature_size = self.h_3, self.w_3 = h_2 // 4, w_2 // 4
         # Appearance features, canonical features, pose features
         (self.f_a_dim, self.f_c_dim, self.f_p_dim) = output_dims

-        # Conv1 in_channels x 64 x 32
-        #    -> feature_map_size x 64 x 32
+        # Conv1 in_channels x H x W
+        #    -> feature_map_size x H x W
         self.conv1 = VGGConv2d(in_channels, feature_channels)
-        # MaxPool1 feature_map_size x 64 x 32
-        #       -> feature_map_size x 32 x 16
-        self.max_pool1 = nn.AdaptiveMaxPool2d((32, 16))
-        # Conv2 feature_map_size x 32 x 16
-        #    -> (feature_map_size*4) x 32 x 16
+        # MaxPool1 feature_map_size x H x W
+        #       -> feature_map_size x H//2 x W//2
+        self.max_pool1 = nn.AdaptiveMaxPool2d((h_1, w_1))
+        # Conv2 feature_map_size x H//2 x W//2
+        #    -> feature_map_size*4 x H//2 x W//2
         self.conv2 = VGGConv2d(feature_channels, feature_channels * 4)
-        # MaxPool2 (feature_map_size*4) x 32 x 16
-        #       -> (feature_map_size*4) x 16 x 8
-        self.max_pool2 = nn.AdaptiveMaxPool2d((16, 8))
-        # Conv3 (feature_map_size*4) x 16 x 8
-        #    -> (feature_map_size*8) x 16 x 8
+        # MaxPool2 feature_map_size*4 x H//2 x W//2
+        #       -> feature_map_size*4 x H//4 x W//4
+        self.max_pool2 = nn.AdaptiveMaxPool2d((h_2, w_2))
+        # Conv3 feature_map_size*4 x H//4 x W//4
+        #    -> feature_map_size*8 x H//4 x W//4
         self.conv3 = VGGConv2d(feature_channels * 4, feature_channels * 8)
-        # Conv4 (feature_map_size*8) x 16 x 8
-        #    -> (feature_map_size*8) x 16 x 8 (for large dataset)
+        # Conv4 feature_map_size*8 x H//4 x W//4
+        #    -> feature_map_size*8 x H//4 x W//4 (for large dataset)
         self.conv4 = VGGConv2d(feature_channels * 8, feature_channels * 8)
-        # MaxPool3 (feature_map_size*8) x 16 x 8
-        #       -> (feature_map_size*8) x 4 x 2
-        self.max_pool3 = nn.AdaptiveMaxPool2d((4, 2))
+        # MaxPool3 feature_map_size*8 x H//4 x W//4
+        #       -> feature_map_size*8 x H//16 x W//16
+        self.max_pool3 = nn.AdaptiveMaxPool2d(self.feature_size)

         embedding_dim = sum(output_dims)
-        # FC (feature_map_size*8) * 4 * 2 -> 320
-        self.fc = BasicLinear(feature_channels * 8 * 2 * 4, embedding_dim)
+        # FC feature_map_size*8 * H//16 * W//16 -> embedding_dim
+        self.fc = BasicLinear(
+            (feature_channels * 8) * self.h_3 * self.w_3, embedding_dim
+        )

     def forward(self, x):
         x = self.conv1(x)
@@ -55,7 +62,7 @@ class Encoder(nn.Module):
         x = self.conv3(x)
         x = self.conv4(x)
         x = self.max_pool3(x)
-        x = x.view(-1, (self.feature_channels * 8) * 2 * 4)
+        x = x.view(-1, (self.feature_channels * 8) * self.h_3 * self.w_3)
         embedding = self.fc(x)
         f_appearance, f_canonical, f_pose = embedding.split(
@@ -71,36 +78,41 @@ class Decoder(nn.Module):
             self,
             input_dims: Tuple[int, int, int] = (128, 128, 64),
             feature_channels: int = 64,
+            feature_size: tuple[int, int] = (4, 3),
             out_channels: int = 3,
     ):
         super().__init__()
         self.feature_channels = feature_channels
+        self.h_0, self.w_0 = feature_size

         embedding_dim = sum(input_dims)
-        # FC 320 -> (feature_map_size*8) * 4 * 2
-        self.fc = BasicLinear(embedding_dim, feature_channels * 8 * 2 * 4)
+        # FC 320 -> feature_map_size*8 * H * W
+        self.fc = BasicLinear(
+            embedding_dim, (feature_channels * 8) * self.h_0 * self.w_0
+        )

-        # TransConv1 (feature_map_size*8) x 4 x 2
-        #         -> (feature_map_size*4) x 8 x 4
+        # TransConv1 feature_map_size*8 x H x W
+        #         -> feature_map_size*4 x H*2 x W*2
         self.trans_conv1 = DCGANConvTranspose2d(feature_channels * 8,
                                                 feature_channels * 4)
-        # TransConv2 (feature_map_size*4) x 8 x 4
-        #         -> (feature_map_size*2) x 16 x 8
+        # TransConv2 feature_map_size*4 x H*2 x W*2
+        #         -> feature_map_size*2 x H*4 x W*4
         self.trans_conv2 = DCGANConvTranspose2d(feature_channels * 4,
                                                 feature_channels * 2)
-        # TransConv3 (feature_map_size*2) x 16 x 8
-        #         -> feature_map_size x 32 x 16
+        # TransConv3 feature_map_size*2 x H*4 x W*4
+        #         -> feature_map_size x H*8 x W*8
         self.trans_conv3 = DCGANConvTranspose2d(feature_channels * 2,
                                                 feature_channels)
-        # TransConv4 feature_map_size x 32 x 16
-        #         -> in_channels x 64 x 32
+        # TransConv4 feature_map_size x H*8 x W*8
+        #         -> in_channels x H*16 x W*16
         self.trans_conv4 = DCGANConvTranspose2d(feature_channels, out_channels,
                                                 is_last_layer=True)

     def forward(self, f_appearance, f_canonical, f_pose, cano_only=False):
         x = torch.cat((f_appearance, f_canonical, f_pose), dim=1)
         x = self.fc(x)
-        x = F.relu(x.view(-1, self.feature_channels * 8, 4, 2), inplace=True)
+        x = x.view(-1, self.feature_channels * 8, self.h_0, self.w_0)
+        x = F.relu(x, inplace=True)
         x = self.trans_conv1(x)
         x = self.trans_conv2(x)
         if cano_only:
@@ -115,12 +127,15 @@ class AutoEncoder(nn.Module):
     def __init__(
             self,
             channels: int = 3,
+            frame_size: tuple[int, int] = (64, 48),
             feature_channels: int = 64,
             embedding_dims: Tuple[int, int, int] = (128, 128, 64)
     ):
         super().__init__()
-        self.encoder = Encoder(channels, feature_channels, embedding_dims)
-        self.decoder = Decoder(embedding_dims, feature_channels, channels)
+        self.encoder = Encoder(channels, frame_size,
+                               feature_channels, embedding_dims)
+        self.decoder = Decoder(embedding_dims, feature_channels,
+                               self.encoder.feature_size, channels)

     def forward(self, x_c1_t2, x_c1_t1=None, x_c2_t2=None):
         n, t, c, h, w = x_c1_t2.size()
diff --git a/models/layers.py b/models/layers.py
index 98e4c10..ae61583 100644
--- a/models/layers.py
+++ b/models/layers.py
@@ -162,7 +162,7 @@ class BasicConv1d(nn.Module):
         return self.conv(x)


-class HorizontalPyramidPooling(BasicConv2d):
+class HorizontalPyramidPooling(nn.Module):
     def __init__(
             self,
             in_channels: int,
@@ -172,8 +172,10 @@ class HorizontalPyramidPooling(BasicConv2d):
             use_max_pool: bool = False,
             **kwargs
     ):
-        super().__init__(in_channels, out_channels, kernel_size=1, **kwargs)
+        super().__init__()
         self.use_1x1conv = use_1x1conv
+        if use_1x1conv:
+            self.conv = BasicConv2d(in_channels, out_channels, 1, **kwargs)
         self.use_avg_pool = use_avg_pool
         self.use_max_pool = use_max_pool
         assert use_avg_pool or use_max_pool, 'Pooling layer(s) required.'
@@ -188,5 +190,5 @@ class HorizontalPyramidPooling(BasicConv2d):
         elif not self.use_avg_pool and self.use_max_pool:
             x = self.max_pool(x)
         if self.use_1x1conv:
-            x = super().forward(x)
+            x = self.conv(x)
         return x
diff --git a/models/model.py b/models/model.py
index 2715b26..3d619fe 100644
--- a/models/model.py
+++ b/models/model.py
@@ -55,6 +55,7 @@ class Model:
         self.is_train: bool = True

         self.in_channels: int = 3
+        self.in_size: tuple[int, int] = (64, 48)
         self.pr: Optional[int] = None
         self.k: Optional[int] = None
@@ -147,7 +148,7 @@ class Model:
         hpm_optim_hp = optim_hp.pop('hpm', {})
         fc_optim_hp = optim_hp.pop('fc', {})
         sched_hp = self.hp.get('scheduler', {})
-        self.rgb_pn = RGBPartNet(self.in_channels, **model_hp,
+        self.rgb_pn = RGBPartNet(self.in_channels, self.in_size, **model_hp,
                                  image_log_on=self.image_log_on)
         # Try to accelerate computation using CUDA or others
         self.rgb_pn = nn.DataParallel(self.rgb_pn)
@@ -230,16 +231,16 @@ class Model:
             if self.image_log_on:
                 i_a, i_c, i_p = images
                 self.writer.add_images(
+                    'Appearance image', i_a, self.curr_iter
+                )
+                self.writer.add_images(
                     'Canonical image', i_c, self.curr_iter
                 )
-                for (i, (o, a, p)) in enumerate(zip(x_c1, i_a, i_p)):
+                for i, (o, p) in enumerate(zip(x_c1, i_p)):
                     self.writer.add_images(
                         f'Original image/batch {i}', o, self.curr_iter
                     )
                     self.writer.add_images(
-                        f'Appearance image/batch {i}', a, self.curr_iter
-                    )
-                    self.writer.add_images(
                         f'Pose image/batch {i}', p, self.curr_iter
                     )
         time_used = datetime.now() - start_time
@@ -306,7 +307,7 @@ class Model:

         # Init models
         model_hp = self.hp.get('model', {})
-        self.rgb_pn = RGBPartNet(ae_in_channels=self.in_channels, **model_hp)
+        self.rgb_pn = RGBPartNet(self.in_channels, self.in_size, **model_hp)
         # Try to accelerate computation using CUDA or others
         self.rgb_pn = nn.DataParallel(self.rgb_pn)
         self.rgb_pn = self.rgb_pn.to(self.device)
@@ -467,6 +468,7 @@ class Model:
             dataset_config: DatasetConfiguration
     ) -> Union[CASIAB]:
         self.in_channels = dataset_config.get('num_input_channels', 3)
+        self.in_size = dataset_config.get('frame_size', (64, 48))
         self._dataset_sig = self._make_signature(
             dataset_config,
             popped_keys=['root_dir', 'cache_on']
diff --git a/models/rgb_part_net.py b/models/rgb_part_net.py
index 845a477..80b3e17 100644
--- a/models/rgb_part_net.py
+++ b/models/rgb_part_net.py
@@ -13,6 +13,7 @@ class RGBPartNet(nn.Module):
     def __init__(
             self,
             ae_in_channels: int = 3,
+            ae_in_size: tuple[int, int] = (64, 48),
             ae_feature_channels: int = 64,
             f_a_c_p_dims: Tuple[int, int, int] = (128, 128, 64),
             hpm_use_1x1conv: bool = False,
@@ -35,7 +36,7 @@ class RGBPartNet(nn.Module):
         self.image_log_on = image_log_on

         self.ae = AutoEncoder(
-            ae_in_channels, ae_feature_channels, f_a_c_p_dims
+            ae_in_channels, ae_in_size, ae_feature_channels, f_a_c_p_dims
         )
         self.pn = PartNet(
             ae_in_channels, fpfe_feature_channels, fpfe_kernel_sizes,
@@ -103,7 +104,7 @@ class RGBPartNet(nn.Module):

             i_a, i_c, i_p = None, None, None
             if self.image_log_on:
-                i_a = self._decode_appr_feature(f_a_, n, t, c, h, w, device)
+                i_a = self._decode_appr_feature(f_a_, n, t, device)
                 # Continue decoding canonical features
                 i_c = self.ae.decoder.trans_conv3(x_c)
                 i_c = torch.sigmoid(self.ae.decoder.trans_conv4(i_c))
@@ -117,14 +118,14 @@ class RGBPartNet(nn.Module):
             x_p = self._decode_pose_feature(f_p_, n, t, c, h, w, device)
             return (x_c, x_p), None, None

-    def _decode_appr_feature(self, f_a_, n, t, c, h, w, device):
+    def _decode_appr_feature(self, f_a_, n, t, device):
         # Decode appearance features
-        x_a_ = self.ae.decoder(
-            f_a_,
-            torch.zeros((n * t, self.f_c_dim), device=device),
-            torch.zeros((n * t, self.f_p_dim), device=device)
+        f_a = f_a_.view(n, t, -1)
+        x_a = self.ae.decoder(
+            f_a.mean(1),
+            torch.zeros((n, self.f_c_dim), device=device),
+            torch.zeros((n, self.f_p_dim), device=device)
         )
-        x_a = x_a_.view(n, t, c, h, w)
         return x_a

     def _decode_cano_feature(self, f_c_, n, t, device):
diff --git a/preprocess.py b/preprocess.py
index 894c7c1..eef59ba 100644
--- a/preprocess.py
+++ b/preprocess.py
@@ -52,7 +52,7 @@ def result_handler(frame_: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         mask_x1 = (mask_xc + mask_half_width).ceil().int()

         # Skip incomplete frames
-        if (height < 64 or width < 64 / MASK_BOX_RATIO) \
+        if (height < 64 or width < 64 / BOX_RATIO_THRESHOLD[1]) \
                 or (mask_x0 < 0 or mask_x1 > 320) \
                 or (mask_y0 < 0 or mask_y1 > 240):
             continue
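As a quick check of the size arithmetic this change introduces (a minimal standalone sketch, not code from the repository): each pooling stage in Encoder halves or quarters the spatial dimensions, so height and width are reduced by a factor of 16 overall, which is why frame_size must be divisible by 16. With the new default (64, 48) the encoder feature map is 4 x 3 and the FC input is 64*8*4*3 = 6144; with the old (64, 32) it was 4 x 2 and 4096. The helper name encoder_feature_size below is hypothetical.

# Minimal sketch (hypothetical helper, not part of the repository):
# mirrors the feature-size arithmetic added to Encoder.__init__ above.
def encoder_feature_size(frame_size=(64, 48), feature_channels=64):
    h_0, w_0 = frame_size
    h_1, w_1 = h_0 // 2, w_0 // 2      # after MaxPool1
    h_2, w_2 = h_1 // 2, w_1 // 2      # after MaxPool2
    h_3, w_3 = h_2 // 4, w_2 // 4      # after MaxPool3: H//16 x W//16
    fc_in = (feature_channels * 8) * h_3 * w_3
    return (h_3, w_3), fc_in


if __name__ == '__main__':
    # new default (64, 48) -> feature_size (4, 3), FC input 6144
    print(encoder_feature_size((64, 48)))
    # old default (64, 32) -> feature_size (4, 2), FC input 4096
    print(encoder_feature_size((64, 32)))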