From 263b8001ca1b25a43d1c87f187423054e141925d Mon Sep 17 00:00:00 2001 From: Jordan Gong Date: Sun, 14 Mar 2021 21:07:03 +0800 Subject: Fix unbalanced datasets --- utils/sampler.py | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) (limited to 'utils') diff --git a/utils/sampler.py b/utils/sampler.py index cdf1984..0c9872c 100644 --- a/utils/sampler.py +++ b/utils/sampler.py @@ -16,7 +16,18 @@ class TripletSampler(data.Sampler): ): super().__init__(data_source) self.metadata_labels = data_source.metadata['labels'] + metadata_conditions = data_source.metadata['conditions'] + self.subsets = {} + for condition in metadata_conditions: + pre, _ = condition.split('-') + if self.subsets.get(pre, None) is None: + self.subsets[pre] = [] + self.subsets[pre].append(condition) + self.num_subsets = len(self.subsets) + self.num_seq = {pre: len(seq) for (pre, seq) in self.subsets.items()} + self.min_num_seq = min(self.num_seq.values()) self.labels = data_source.labels + self.conditions = data_source.conditions self.length = len(self.labels) self.indexes = np.arange(0, self.length) (self.pr, self.k) = batch_size @@ -27,15 +38,31 @@ class TripletSampler(data.Sampler): # Sample pr subjects by sampling labels appeared in dataset sampled_subjects = random.sample(self.metadata_labels, k=self.pr) for label in sampled_subjects: - clips_from_subject = self.indexes[self.labels == label].tolist() + mask = self.labels == label + # Fix unbalanced datasets + if self.num_subsets > 1: + condition_mask = np.zeros(self.conditions.shape, dtype=bool) + for num, conditions_ in zip( + self.num_seq.values(), self.subsets.values() + ): + if num > self.min_num_seq: + conditions = random.sample( + conditions_, self.min_num_seq + ) + else: + conditions = conditions_ + for condition in conditions: + condition_mask |= self.conditions == condition + mask &= condition_mask + clips = self.indexes[mask].tolist() # Sample k clips from the subject without replacement if # have enough clips, k more clips will sampled for # disentanglement k = self.k * 2 - if len(clips_from_subject) >= k: - _sampled_indexes = random.sample(clips_from_subject, k=k) + if len(clips) >= k: + _sampled_indexes = random.sample(clips, k=k) else: - _sampled_indexes = random.choices(clips_from_subject, k=k) + _sampled_indexes = random.choices(clips, k=k) sampled_indexes += _sampled_indexes yield sampled_indexes -- cgit v1.2.3 From b6e5972b64cc61fc967cf3d098fc629d781adce4 Mon Sep 17 00:00:00 2001 From: Jordan Gong Date: Mon, 22 Mar 2021 19:32:16 +0800 Subject: Add embedding visualization and validate on testing set --- utils/configuration.py | 1 + utils/triplet_loss.py | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'utils') diff --git a/utils/configuration.py b/utils/configuration.py index f6ac182..157d249 100644 --- a/utils/configuration.py +++ b/utils/configuration.py @@ -14,6 +14,7 @@ class DatasetConfiguration(TypedDict): name: str root_dir: str train_size: int + val_size: int num_sampled_frames: int truncate_threshold: int discard_threshold: int diff --git a/utils/triplet_loss.py b/utils/triplet_loss.py index 03fff21..5e3a97a 100644 --- a/utils/triplet_loss.py +++ b/utils/triplet_loss.py @@ -28,6 +28,7 @@ class BatchTripletLoss(nn.Module): else: # is_all positive_negative_dist = self._all_distance(dist, y, p, n) + non_zero_counts = None if self.margin: losses = F.relu(self.margin + positive_negative_dist).view(p, -1) non_zero_counts = (losses != 0).sum(1).float() @@ -35,14 +36,18 @@ class BatchTripletLoss(nn.Module): loss_metric = self._none_zero_mean(losses, non_zero_counts) else: # is_sum loss_metric = losses.sum(1) - return loss_metric, flat_dist, non_zero_counts else: # Soft margin losses = F.softplus(positive_negative_dist).view(p, -1) if self.is_mean: loss_metric = losses.mean(1) else: # is_sum loss_metric = losses.sum(1) - return loss_metric, flat_dist, None + + return { + 'loss': loss_metric, + 'dist': flat_dist, + 'counts': non_zero_counts + } @staticmethod def _batch_distance(x): -- cgit v1.2.3 From 5a063855dbecb8f1a86ad25d9e61a9c8b63312b3 Mon Sep 17 00:00:00 2001 From: Jordan Gong Date: Thu, 25 Mar 2021 12:23:23 +0800 Subject: Bug fixes and refactoring 1. Correct trained model signature 2. Move `val_size` to system config --- utils/configuration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'utils') diff --git a/utils/configuration.py b/utils/configuration.py index 157d249..5a5bc0c 100644 --- a/utils/configuration.py +++ b/utils/configuration.py @@ -8,13 +8,13 @@ class SystemConfiguration(TypedDict): CUDA_VISIBLE_DEVICES: str save_dir: str image_log_on: bool + val_size: int class DatasetConfiguration(TypedDict): name: str root_dir: str train_size: int - val_size: int num_sampled_frames: int truncate_threshold: int discard_threshold: int -- cgit v1.2.3