import argparse
import os
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, Callable

import torch
import yaml
from torch.utils.data import Dataset
from torchvision.datasets import CIFAR10, CIFAR100, ImageNet
from torchvision.transforms import transforms

path = str(Path(Path(__file__).parent.absolute()).parent.absolute())
sys.path.insert(0, path)

from libs.criteria import InfoNCELoss
from libs.datautils import color_distortion, Clip, RandomGaussianBlur, TwinTransform
from libs.optimizers import LARS
from libs.schedulers import LinearWarmupAndCosineAnneal, LinearLR
from libs.utils import Trainer, BaseConfig
from libs.logging import BaseBatchLogRecord, Loggers
from simclr.models import CIFARSimCLRResNet50, ImageNetSimCLRResNet50


def parse_args_and_config():
    parser = argparse.ArgumentParser(
        description='Contrastive baseline SimCLR',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument('--codename', default='cifar10-simclr-128-lars-warmup',
                        type=str, help="Model descriptor")
    parser.add_argument('--log-dir', default='logs', type=str,
                        help="Path to log directory")
    parser.add_argument('--checkpoint-dir', default='checkpoints', type=str,
                        help="Path to checkpoints directory")
    parser.add_argument('--seed', default=None, type=int,
                        help='Random seed for reproducibility')
    parser.add_argument('--num-iters', default=23438, type=int,
                        help='Number of iters (default is 50 epochs equiv., '
                             'around dataset_size * epochs / batch_size)')
    parser.add_argument('--config', type=argparse.FileType(mode='r'),
                        help='Path to config file (optional)')

    # TODO: Add model hyperparams dataclass
    parser.add_argument('--hid-dim', default=2048, type=int,
                        help='Number of dimension of embedding')
    parser.add_argument('--out-dim', default=128, type=int,
                        help='Number of dimension after projection')
    parser.add_argument('--temp', default=0.5, type=float,
                        help='Temperature in InfoNCE loss')

    dataset_group = parser.add_argument_group('Dataset parameters')
    dataset_group.add_argument('--dataset-dir', default='dataset', type=str,
                               help="Path to dataset directory")
    dataset_group.add_argument('--dataset', default='cifar10', type=str,
                               choices=('cifar10, cifar100', 'imagenet'),
                               help="Name of dataset")
    dataset_group.add_argument('--crop-size', default=32, type=int,
                               help='Random crop size after resize')
    dataset_group.add_argument('--crop-scale-range', nargs=2, default=(0.8, 1),
                               type=float, help='Random resize scale range',
                               metavar=('start', 'stop'))
    dataset_group.add_argument('--hflip-prob', default=0.5, type=float,
                               help='Random horizontal flip probability')
    dataset_group.add_argument('--distort-strength', default=0.5, type=float,
                               help='Distortion strength')
    dataset_group.add_argument('--gauss-ker-scale', default=10, type=float,
                               help='Gaussian kernel scale factor '
                                    '(s = img_size / ker_size)')
    dataset_group.add_argument('--gauss-sigma-range', nargs=2, default=(0.1, 2),
                               type=float, help='Random gaussian blur sigma range',
                               metavar=('start', 'stop'))
    dataset_group.add_argument('--gauss-prob', default=0.5, type=float,
                               help='Random gaussian blur probability')

    dataloader_group = parser.add_argument_group('Dataloader parameters')
    dataloader_group.add_argument('--batch-size', default=128, type=int,
                                  help='Batch size')
    dataloader_group.add_argument('--num-workers', default=2, type=int,
                                  help='Number of dataloader processes')

    optim_group = parser.add_argument_group('Optimizer parameters')
    optim_group.add_argument('--optim', default='lars', type=str,
                             choices=('adam', 'sgd', 'lars'),
                             help="Name of optimizer")
    optim_group.add_argument('--lr', default=1., type=float,
                             help='Learning rate')
    optim_group.add_argument('--betas', nargs=2, default=(0.9, 0.999), type=float,
                             help='Adam betas', metavar=('beta1', 'beta2'))
    optim_group.add_argument('--momentum', default=0.9, type=float,
                             help='SDG momentum')
    optim_group.add_argument('--weight-decay', default=1e-6, type=float,
                             help='Weight decay (l2 regularization)')

    sched_group = parser.add_argument_group('Scheduler parameters')
    sched_group.add_argument('--sched', default='warmup-anneal', type=str,
                             choices=('const', None, 'linear', 'warmup-anneal'),
                             help="Name of scheduler")
    sched_group.add_argument('--warmup-iters', default=2344, type=int,
                             help='Epochs for warmup (`warmup-anneal` scheduler only)')

    args = parser.parse_args()
    if args.config:
        config = yaml.safe_load(args.config)
        args.__dict__ |= {
            k: tuple(v) if isinstance(v, list) else v
            for k, v in config.items()
        }
    args.checkpoint_dir = os.path.join(args.checkpoint_dir, args.codename)
    args.log_dir = os.path.join(args.log_dir, args.codename)

    return args


@dataclass
class SimCLRConfig(BaseConfig):
    @dataclass
    class DatasetConfig(BaseConfig.DatasetConfig):
        dataset_dir: str
        crop_size: int
        crop_scale_range: tuple[float, float]
        hflip_prob: float
        distort_strength: float
        gauss_ker_scale: float
        gauss_sigma_range: tuple[float, float]
        gauss_prob: float

    @dataclass
    class OptimConfig(BaseConfig.OptimConfig):
        momentum: float
        betas: tuple[float, float]
        weight_decay: float


class SimCLRTrainer(Trainer):
    def __init__(self, hid_dim, out_dim, **kwargs):
        self.hid_dim = hid_dim
        self.out_dim = out_dim
        super(SimCLRTrainer, self).__init__(**kwargs)

    @dataclass
    class BatchLogRecord(BaseBatchLogRecord):
        lr: float | None
        train_loss: float | None
        train_accuracy: float | None
        eval_loss: float | None
        eval_accuracy: float | None

    @staticmethod
    def _prepare_dataset(dataset_config: SimCLRConfig.DatasetConfig) -> tuple[Dataset, Dataset]:
        basic_augmentation = transforms.Compose([
            transforms.RandomResizedCrop(
                dataset_config.crop_size,
                scale=dataset_config.crop_scale_range,
                interpolation=transforms.InterpolationMode.BICUBIC
            ),
            transforms.RandomHorizontalFlip(dataset_config.hflip_prob),
            color_distortion(dataset_config.distort_strength),
        ])
        if dataset_config.dataset in {'cifar10', 'cifar100', 'cifar'}:
            transform = transforms.Compose([
                basic_augmentation,
                transforms.ToTensor(),
                Clip(),
            ])
            if dataset_config.dataset in {'cifar10', 'cifar'}:
                train_set = CIFAR10(dataset_config.dataset_dir, train=True,
                                    transform=TwinTransform(transform),
                                    download=True)
                test_set = CIFAR10(dataset_config.dataset_dir, train=False,
                                   transform=TwinTransform(transform))
            else:  # CIFAR-100
                train_set = CIFAR100(dataset_config.dataset_dir, train=True,
                                     transform=TwinTransform(transform),
                                     download=True)
                test_set = CIFAR100(dataset_config.dataset_dir, train=False,
                                    transform=TwinTransform(transform))
        elif dataset_config.dataset in {'imagenet1k', 'imagenet'}:
            random_gaussian_blur = RandomGaussianBlur(
                kernel_size=dataset_config.crop_size // dataset_config.gauss_ker_scale,
                sigma_range=dataset_config.gauss_sigma_range,
                p=dataset_config.gauss_prob
            ),
            transform = transforms.Compose([
                basic_augmentation,
                random_gaussian_blur,
                transforms.ToTensor(),
                Clip()
            ])
            train_set = ImageNet(dataset_config.dataset_dir, 'train',
                                 transform=TwinTransform(transform))
            test_set = ImageNet(dataset_config.dataset_dir, 'val',
                                transform=TwinTransform(transform))
        else:
            raise NotImplementedError(f"Unimplemented dataset: '{dataset_config.dataset}")

        return train_set, test_set

    def _init_models(self, dataset: str) -> Iterable[tuple[str, torch.nn.Module]]:
        if dataset in {'cifar10', 'cifar100', 'cifar'}:
            model = CIFARSimCLRResNet50(self.hid_dim, self.out_dim)
        elif dataset in {'imagenet1k', 'imagenet'}:
            model = ImageNetSimCLRResNet50(self.hid_dim, self.out_dim)
        else:
            raise NotImplementedError(f"Unimplemented dataset: '{dataset}")

        yield 'model', model

    @staticmethod
    def _configure_optimizers(
            models: Iterable[tuple[str, torch.nn.Module]],
            optim_config: SimCLRConfig.OptimConfig,
    ) -> Iterable[tuple[str, torch.optim.Optimizer]]:
        def exclude_from_wd_and_adaptation(name):
            if 'bn' in name:
                return True
            if optim_config.optim == 'lars' and 'bias' in name:
                return True

        for model_name, model in models:
            param_groups = [
                {
                    'params': [p for name, p in model.named_parameters()
                               if not exclude_from_wd_and_adaptation(name)],
                    'weight_decay': optim_config.weight_decay,
                    'layer_adaptation': True,
                },
                {
                    'params': [p for name, p in model.named_parameters()
                               if exclude_from_wd_and_adaptation(name)],
                    'weight_decay': 0.,
                    'layer_adaptation': False,
                },
            ]
            if optim_config.optim == 'adam':
                optimizer = torch.optim.Adam(
                    param_groups,
                    lr=optim_config.lr,
                    betas=optim_config.betas,
                )
            elif optim_config.optim in {'sgd', 'lars'}:
                optimizer = torch.optim.SGD(
                    param_groups,
                    lr=optim_config.lr,
                    momentum=optim_config.momentum,
                )
            else:
                raise NotImplementedError(f"Unimplemented optimizer: '{optim_config.optim}'")

            yield f"{model_name}_optim", optimizer

    def _custom_init_fn(self, config: SimCLRConfig):
        self.optims = {n: LARS(o) if config.optim_config.optim == 'lars' else o
                       for n, o in self.optims.items()}

    def train(self, num_iters: int, loss_fn: Callable, logger: Loggers, device: torch.device):
        model = self.models['model']
        optim = self.optims['model_optim']
        sched = self.scheds['model_optim_sched']
        train_loader = iter(self.train_loader)
        model.train()
        for iter_ in range(self.restore_iter, num_iters):
            input_, _ = next(train_loader)
            input_ = torch.cat(input_).to(device)
            model.zero_grad()
            output = model(input_)
            train_loss, train_accuracy = loss_fn(output)
            train_loss.backward()
            optim.step()
            self.log(logger, self.BatchLogRecord(
                iter_, num_iters, iter_, iter_, num_iters,
                optim.param_groups[0]['lr'],
                train_loss.item(), train_accuracy.item(),
                eval_loss=None, eval_accuracy=None,
            ))
            if (iter_ + 1) % (num_iters // 100) == 0:
                metrics = torch.Tensor(list(self.eval(loss_fn, device))).mean(0)
                eval_loss = metrics[0].item()
                eval_accuracy = metrics[1].item()
                eval_log = self.BatchLogRecord(
                    iter_, num_iters, iter_, iter_, num_iters,
                    lr=None, train_loss=None, train_accuracy=None,
                    eval_loss=eval_loss, eval_accuracy=eval_accuracy,
                )
                self.log(logger, eval_log)
                self.save_checkpoint(eval_log)
                model.train()
            if sched is not None:
                sched.step()

    def eval(self, loss_fn: Callable, device: torch.device):
        model = self.models['model']
        model.eval()
        with torch.no_grad():
            for input_, _ in self.test_loader:
                input_ = torch.cat(input_).to(device)
                output = model(input_)
                loss, accuracy = loss_fn(output)
                yield loss.item(), accuracy.item()


if __name__ == '__main__':
    args = parse_args_and_config()
    config = SimCLRConfig.from_args(args)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    trainer = SimCLRTrainer(
        seed=args.seed,
        checkpoint_dir=args.checkpoint_dir,
        device=device,
        inf_mode=True,
        num_iters=args.num_iters,
        config=config,
        hid_dim=args.hid_dim,
        out_dim=args.out_dim,
    )

    loggers = trainer.init_logger(args.log_dir)
    trainer.train(args.num_iters, InfoNCELoss(args.temp), loggers, device)