# loss_comparison.py

# We compare the loss functions of several current state-of-the-art anomaly detection models.
# All models are compared under the same network architecture and the same hyper-parameters.
# All of these models use a limited number of labeled anomalies.


'''
- Compare the following loss functions within a unified framework:
- deviation loss
- inverse loss
- minus loss
- score distribution loss (i.e., minimize the overlapped area)
'''

import copy
import numpy as np
import torch
from torch import nn
from torch.utils.data import Subset, DataLoader, TensorDataset
from tqdm import tqdm

from math import cos
import pandas as pd
import random
import scipy.stats as scp
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

import os
import sys

from myutils import Utils
from data_generator import DataGenerator
from baseline.ADSD.fit import GaussianKDE, loss_overlap

class network(nn.Module):
    '''
    Single-input scoring network: one hidden layer (input_size -> 20, ReLU)
    followed by a scalar regression head (20 -> 1).
    '''

    def __init__(self, input_size, batchnorm=False, abs=False):
        '''
        input_size: the input dimension of X
        batchnorm: whether to normalize the network output score (BatchNorm1d) before the loss forward
        abs: whether to return the absolute value of the score, i.e. keep the score non-negative
        '''
        super().__init__()
        self.abs = abs

        self.feature = nn.Sequential(
            nn.Linear(input_size, 20),
            nn.ReLU()
        )

        if batchnorm:
            self.reg = nn.Sequential(
                nn.Linear(20, 1),
                nn.BatchNorm1d(num_features=1)
            )
        else:
            self.reg = nn.Linear(20, 1)

    def forward(self, X):
        '''
        Return (feature, score): the 20-dimensional hidden representation and
        the score with its trailing unit dimension removed.
        '''
        feature = self.feature(X)
        score = self.reg(feature)
        if self.abs:
            score = torch.abs(score)

        # Squeeze only the trailing unit dimension: a bare squeeze() would also
        # drop the batch dimension of a single-sample batch and return a 0-d
        # tensor, which breaks the boolean-mask indexing used by the losses.
        return feature, score.squeeze(-1)

class network_pair(nn.Module):
    '''
    Pair-input scoring network: both inputs go through the same feature
    extractor (input_size -> 20, ReLU); the two representations are
    concatenated (40 dims) and mapped to one scalar score for the pair.
    '''

    def __init__(self, input_size):
        '''
        input_size: the input dimension of each element of the pair
        '''
        super().__init__()

        self.feature = nn.Sequential(
            nn.Linear(input_size, 20),
            nn.ReLU()
        )

        self.reg = nn.Linear(40, 1)

    # the input vector of prenet should be a pair
    def forward(self, X_left, X_right):
        '''
        Return (feature, score): the concatenated pair representation and the
        pair score with its trailing unit dimension removed.
        '''
        feature_left = self.feature(X_left)
        feature_right = self.feature(X_right)

        # concat feature
        feature = torch.cat((feature_left, feature_right), dim=1)
        # generate score based on the concat feature
        score = self.reg(feature)

        # Squeeze only the trailing unit dimension so that a batch holding a
        # single pair still yields a 1-D tensor instead of a 0-d scalar.
        return feature, score.squeeze(-1)

class Comparison():
    def __init__(self,
                 epochs=200,
                 batch_size=256,
                 lr=1e-2,
                 mom=0.7,
                 weight_decay=1e-2,
                 seed=42,
                 n_pts=1000,
                 n_dim=2,
                 n_Gaussians=2,
                 anomaly_ratio=0.05,
                 la=0.05):
        '''
        Store the training hyper-parameters and the synthetic-data settings.

        epochs: number of training epochs of every fit_with_* method
        batch_size: batch size of the training loader
        lr: learning rate of the optimizer
        mom: momentum, used by the SGD optimizer of the score distribution loss
        weight_decay: weight decay of the optimizer
        seed: seed passed to the utils seeding helper, the data generator and loss_overlap
        n_pts: number of synthetic data points
        n_dim: stored, but not read by the code in this class
        n_Gaussians: number of Gaussian components of the synthetic data
        anomaly_ratio: fraction of the synthetic points labeled as anomalies
        la: fraction of the training anomalies that keep their label
            (data_preprocess asserts that this is a float)
        '''
        # helper object; this class calls its set_seed, metric, cal_loss,
        # sampler and sampler_pairs methods
        self.utils = Utils()

        # optimisation settings
        self.epochs, self.batch_size = epochs, batch_size
        self.lr, self.mom, self.weight_decay = lr, mom, weight_decay

        # settings of the synthetic data generation and of the labeling step
        self.seed = seed
        self.n_pts, self.n_dim = n_pts, n_dim
        self.n_Gaussians = n_Gaussians
        self.anomaly_ratio, self.la = anomaly_ratio, la

    # data preprocessing step
    def data_preprocess(self, X, y):
        '''
        Split, scale and partially unlabel a dataset.

        X, y: samples and their ground-truth 0/1 labels (1 = anomaly)

        The data is split 70/30 (stratified by y), min-max scaled with
        statistics fitted on the training part only, and then only a fraction
        self.la of the training anomalies keep the label 1; all remaining
        training samples get the label 0. The test labels are left untouched.

        Returns a dict with the keys 'X_train', 'X_test', 'y_train', 'y_test'.
        '''
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, shuffle=True, stratify=y)

        # the scaler only sees the training part
        scaler = MinMaxScaler().fit(X_train)
        X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

        # positions of the normal samples and of all anomalies in the training part
        normal_idx = np.where(y_train == 0)[0]
        anomaly_idx = np.where(y_train == 1)[0]

        assert type(self.la) == float
        n_labeled = int(self.la * len(anomaly_idx))
        labeled_idx = np.random.choice(anomaly_idx, n_labeled, replace=False)

        # anomalies that lose their label contaminate the unlabeled data
        hidden_idx = np.setdiff1d(anomaly_idx, labeled_idx)
        unlabeled_idx = np.append(normal_idx, hidden_idx)

        # unlabeled data -> 0, labeled anomalies -> 1
        y_train[unlabeled_idx] = 0
        y_train[labeled_idx] = 1

        return {'X_train':X_train, 'X_test':X_test, 'y_train':y_train, 'y_test':y_test}

    # the original synthetic data points by Gaussian mixture
    def data_generator_synthetic(self, mode=None):
        '''
        mode: can be either "Cluster" or "GM"
        Cluster: the normal data and anomalies are generated by different Gaussian distribution (different mean)
        GM: the data is generated by the Gaussian mixture and the least likelihood data points are regarded as anomalies
        '''

        # set seed
        self.utils.set_seed(self.seed)

        if mode == 'Cluster':
            pts = []

            means = []
            covs = []
            for i in range(self.n_Gaussians):
                a = 4 * (i+1)  # a = np.random.randint(0, 3 * n_Gaussians), i+1, by bug
                b = 4 * (i+1)  # b = np.random.randint(0, 3 * n_Gaussians)
                c = np.random.randint(0, 1000)

                means.append([a, b])
                cov = [[1, cos(c)], [cos(c), 2]]  # cov = [[0.3 + 3*i,0.1],[0.1,0.1 + 3*i]]

                covs.append(cov)

                if i == 0:
                    pts.append(np.random.multivariate_normal(means[i], covs[i],
                                                             int(self.n_pts * (1 - self.anomaly_ratio))))
                elif i == 1:
                    pts.append(np.random.multivariate_normal(means[i], covs[i],
                                                             int(self.n_pts * (self.anomaly_ratio))))
                else:
                    raise NotImplementedError

            # combine the samples for n_Gaussian different distributions
            X = np.vstack(pts)

            # generate the ground-truth label
            y = np.append(np.repeat(0, int(self.n_pts * (1 - self.anomaly_ratio))),
                          np.repeat(1, int(self.n_pts * (self.anomaly_ratio))))

        elif mode == 'GM':
            means = []
            covs = []
            for i in range(self.n_Gaussians):
                a = 4 * (i+1) # a = np.random.randint(0, 3 * n_Gaussians), i+1, by bug
                b = 4 * (i+1) # b = np.random.randint(0, 3 * n_Gaussians)
                c = np.random.randint(0, 1000)
                cov = [[1, cos(c)], [cos(c), 2]]

                means.append([a, b])
                covs.append(cov)

            pts = []
            for i in range(self.n_Gaussians):
                pts.append(np.random.multivariate_normal(means[i], covs[i], self.n_pts // self.n_Gaussians))

            # combine the samples for n_Gaussian different distributions
            X = np.vstack(pts)

            # generate the ground-truth label, where the anoamlies are the top-k smallest liklihood sample points
            score_clf = []
            for i in range(self.n_Gaussians):
                score_clf.append(scp.multivariate_normal(mean=means[i], cov=covs[i]))

            l_scores = np.zeros(X.shape[0])
            for i in range(len(l_scores)):
                for j in range(self.n_Gaussians):
                    l_scores[i] += score_clf[j].pdf(X[i])  # the likelihood of the two dimensions

            y = np.repeat(0, X.shape[0])

            # the top 1% of the likelihood score are the anomalies
            y[np.argsort(l_scores)[:int(self.anomaly_ratio * len(y))]] = 1

            # # the likelihood is treated as the ground-truth anomaly score
            # y = np.concatenate((y.reshape(-1, 1), l_scores.reshape(-1, 1)), axis=1)

        else:
            raise NotImplementedError

        return X, y

    # inverse loss
    def fit_with_inverse_loss(self, X_test_tensor=None, y_test=None):
        loss_epoch, metric_epoch = [], []
        for epoch in range(self.epochs):
            with torch.no_grad():
                _, score_test = self.model(X_test_tensor)
                score_test = score_test.numpy()
                performance = self.utils.metric(y_true=y_test, y_score=score_test)
                metric_epoch.append(performance)

            loss_batch = []
            for data in self.train_loader:
                X, y = data
                # transfer the labels
                y[y == 1] = -1
                y[y == 0] = 1

                # clear gradient
                self.model.zero_grad()

                # loss forward
                _, score = self.model(X)
                loss = torch.mean(torch.pow(score, y))

                # loss backward
                loss.backward()
                loss_batch.append(loss.detach().numpy()[()])

                # update
                self.optimizer.step()

            loss_epoch.append(np.mean(loss_batch))

        return loss_epoch, metric_epoch

    # minus loss
    def fit_with_minus_loss(self, X_test_tensor=None, y_test=None):
        loss_epoch, metric_epoch = [], []
        for epoch in range(self.epochs):
            with torch.no_grad():
                _, score_test = self.model(X_test_tensor)
                score_test = score_test.numpy()
                performance = self.utils.metric(y_true=y_test, y_score=score_test)
                metric_epoch.append(performance)

            loss_batch = []
            for data in self.train_loader:
                X, y = data

                # clear gradient
                self.model.zero_grad()

                # loss forward
                _, score = self.model(X)
                # loss = torch.mean(score[y == 0]) - torch.mean(score[y == 1])
                loss = torch.mean(score[y == 0] + torch.max(torch.zeros_like(score[y == 1]), 5.0 - score[y == 1]))

                # loss backward
                loss.backward()
                loss_batch.append(loss.detach().numpy()[()])

                # update
                self.optimizer.step()

            loss_epoch.append(np.mean(loss_batch))

        return loss_epoch, metric_epoch

    # hinge loss
    def fit_with_hinge_loss(self, X_test_tensor=None, y_test=None):
        # the ranking loss (hinge loss)
        ranking_loss = torch.nn.MarginRankingLoss(margin=5.0)

        loss_epoch, metric_epoch = [], []
        for epoch in range(self.epochs):
            with torch.no_grad():
                _, score_test = self.model(X_test_tensor)
                score_test = score_test.numpy()
                performance = self.utils.metric(y_true=y_test, y_score=score_test)
                metric_epoch.append(performance)

            loss_batch = []
            for data in self.train_loader:
                X, y = data

                # clear gradient
                self.model.zero_grad()

                # loss forward
                _, score = self.model(X)
                score_u = score[y==0]
                score_a = score[y==1]
                loss = ranking_loss(score_a, score_u, torch.ones_like(score_a))

                # loss backward
                loss.backward()
                loss_batch.append(loss.detach().numpy()[()])

                # update
                self.optimizer.step()

            loss_epoch.append(np.mean(loss_batch))

        return loss_epoch, metric_epoch

    # deviation loss
    def fit_with_deviation_loss(self, X_test_tensor=None, y_test=None):
        loss_epoch, metric_epoch = [], []
        for epoch in range(self.epochs):
            with torch.no_grad():
                _, score_test = self.model(X_test_tensor)
                score_test = score_test.numpy()
                performance = self.utils.metric(y_true=y_test, y_score=score_test)
                metric_epoch.append(performance)

            loss_batch = []
            for data in self.train_loader:
                X, y = data

                # clear gradient
                self.model.zero_grad()

                # loss forward
                _, score = self.model(X)
                loss = self.utils.cal_loss(y=y, y_pred=score)

                # loss backward
                loss.backward()
                loss_batch.append(loss.detach().numpy()[()])

                # update
                self.optimizer.step()

            loss_epoch.append(np.mean(loss_batch))

        return loss_epoch, metric_epoch

    # ordinal loss
    def fit_with_ordinal_loss(self, X_test_tensor=None, y_test=None):
        loss_epoch, metric_epoch = [], []
        for epoch in range(self.epochs):
            # generate the batch samples
            X_train_loader, y_train_loader = self.utils.sampler_pairs(self.X_train_tensor, self.y_train,
                                             epoch=self.epochs, batch_num=20, batch_size=self.batch_size,
                                             s_a_a=8.0, s_a_u=4.0, s_u_u=0.0)

            with torch.no_grad():
                score_test = self.predict_score(self.model, self.X_train_tensor, self.y_train, X_test_tensor)
                performance = self.utils.metric(y_true=y_test, y_score=score_test)
                metric_epoch.append(performance)

            loss_batch = []
            for i in range(len(X_train_loader)):
                X_left, X_right = X_train_loader[i][0], X_train_loader[i][1]
                y = y_train_loader[i]

                # clear gradient
                self.model.zero_grad()

                # loss forward
                _, score = self.model(X_left, X_right)
                loss = torch.mean(torch.abs(y - score))

                # loss backward
                loss.backward()
                loss_batch.append(loss.detach().numpy()[()])
                # update model parameters
                self.optimizer.step()

            loss_epoch.append(np.mean(loss_batch))

        return loss_epoch, metric_epoch

    # the input of anomaly detection model trained by the ordinal loss should ba a pair
    def predict_score(self, model, X_train_tensor, y_train, X_test_tensor, num=30):
        model = model.eval()

        score = []
        for i in range(X_test_tensor.size(0)):
            index_a = np.random.choice(np.where(self.y_train==1)[0], num, replace=True) #postive sample in training set
            index_u = np.random.choice(np.where(self.y_train==0)[0], num, replace=True) #negative sample in training set

            X_train_a_tensor = self.X_train_tensor[index_a]
            X_train_u_tensor = self.X_train_tensor[index_u]

            #注意x的顺序需要在u之前，即(a,u)而非(u,a)对
            with torch.no_grad():
                _, score_a_x = self.model(X_train_a_tensor, torch.cat(num * [X_test_tensor[i].view(1, -1)]))
                _, score_x_u = self.model(torch.cat(num * [X_test_tensor[i].view(1, -1)]), X_train_u_tensor)

            score_sub = torch.mean(score_a_x + score_x_u)
            score_sub = score_sub.numpy()[()]

            #entire score
            score.append(score_sub)

        return np.array(score)

    # score distribution based loss
    def fit_with_score_loss(self, X_test_tensor=None, y_test=None):
        '''
        Train self.model with the score distribution loss: loss_overlap is
        applied to the scores of the unlabeled samples and of the labeled
        anomalies of every batch (bandwidth 1.0 for both groups).

        X_test_tensor, y_test: test inputs and labels; the model is evaluated on
        them at the start of every epoch, i.e. before that epoch's updates.

        Returns (loss_epoch, metric_epoch): the mean batch loss and the test
        performance of every epoch.
        '''
        epoch_losses, epoch_metrics = [], []
        for _ in range(self.epochs):
            # evaluation first, without tracking gradients
            with torch.no_grad():
                test_scores = self.model(X_test_tensor)[1].numpy()
                epoch_metrics.append(self.utils.metric(y_true=y_test, y_score=test_scores))

            batch_losses = []
            for inputs, labels in self.train_loader:
                # reset the gradients accumulated by the previous step
                self.model.zero_grad()

                # forward pass, scores split by label
                scores = self.model(inputs)[1]
                unlabeled_scores = scores[labels == 0]
                anomaly_scores = scores[labels == 1]

                # loss between the two score distributions
                batch_loss = loss_overlap(s_u=unlabeled_scores, s_a=anomaly_scores, seed=self.seed,
                                          bw_u=1.0, bw_a=1.0, pro=True)

                # backward pass and parameter update
                batch_loss.backward()
                batch_losses.append(batch_loss.detach().item())
                self.optimizer.step()

            epoch_losses.append(np.mean(batch_losses))

        return epoch_losses, epoch_metrics

    # fitting function
    def fit2test(self, data=None, dataset=None,
                 synthetic=False, synthetic_mode=None,
                 realistic_synthetic_mode=None,
                 resampling=False, loss_name=None):
        '''
        Prepare the data, train a model with the chosen loss and evaluate it.

        data: any data can be provided in dict form, with the keys
              'X_train', 'y_train', 'X_test' and 'y_test'; when given, no data is generated
        dataset: name of the real-world dataset (used when data is None and synthetic is False)
        synthetic: whether to generate synthetic data (Gaussian mixture) for evaluating models
        synthetic_mode: mode passed to data_generator_synthetic ("Cluster" or "GM")
        realistic_synthetic_mode: forwarded to the real-world data generator
        resampling: whether to use the batch resampling strategy
        loss_name: which loss is used to fit the model; one of inverse_loss, minus_loss,
                   hinge_loss, deviation_loss, ordinal_loss, score_distribution_loss

        Returns a dict with the keys 'data', 'model_init', 'model', 'loss',
        'metric_epoch', 'score', 'feature' and 'performance'.

        Raises NotImplementedError for an unknown loss_name.
        '''
        fit_functions = {
            'inverse_loss': self.fit_with_inverse_loss,
            'minus_loss': self.fit_with_minus_loss,
            'hinge_loss': self.fit_with_hinge_loss,
            'ordinal_loss': self.fit_with_ordinal_loss,
            'deviation_loss': self.fit_with_deviation_loss,
            'score_distribution_loss': self.fit_with_score_loss,
        }

        # reject an unknown loss before any data is generated or any model is built
        if loss_name not in tuple(fit_functions):
            raise NotImplementedError('The loss function name should be in [inverse_loss, minus_loss,'
                                      ' hinge_loss, deviation_loss, ordinal_loss, score_distribution_loss]')

        if data is None:
            if synthetic:
                # generate synthetic data
                X, y = self.data_generator_synthetic(mode=synthetic_mode)
                data = self.data_preprocess(X=X, y=y)
            else:
                # import real-world dataset
                generator = DataGenerator(seed=self.seed, dataset=dataset,
                                          generate_duplicates=True,
                                          n_samples_threshold=1000,
                                          n_samples_up_threshold=1000,
                                          show_statistic=False)
                data = generator.generator(la=self.la, realistic_synthetic_mode=realistic_synthetic_mode)

        # training set
        self.X_train, self.y_train = data['X_train'], data['y_train']
        self.input_size = self.X_train.shape[1]

        if resampling:
            self.X_train, self.y_train = self.utils.sampler(self.X_train, self.y_train, self.batch_size)

        # Build the training tensor only after the optional resampling, so that
        # it stays row-aligned with self.y_train: the ordinal-loss path indexes
        # this tensor with positions computed from self.y_train.
        self.X_train_tensor = torch.from_numpy(self.X_train).float()

        # shuffle=False keeps the batch composition as it comes from the data / the resampler
        train_tensor = TensorDataset(self.X_train_tensor, torch.tensor(self.y_train))
        self.train_loader = DataLoader(train_tensor, batch_size=self.batch_size, shuffle=False, drop_last=True)

        # testing set
        X_test, y_test = data['X_test'], data['y_test']
        X_test_tensor = torch.from_numpy(X_test).float()

        # initialization of the model
        if loss_name == 'ordinal_loss':
            self.model = network_pair(input_size=self.input_size)
        elif loss_name == 'score_distribution_loss':
            self.model = network(input_size=self.input_size, batchnorm=True)
        elif loss_name in ['inverse_loss', 'minus_loss']:
            # we keep the anomaly score positive to be consistent with the original paper
            self.model = network(input_size=self.input_size, abs=True)
        else:
            self.model = network(input_size=self.input_size)

        # save the initial model parameters
        self.model_init = copy.deepcopy(self.model)

        # optimizer
        if loss_name == 'score_distribution_loss':
            self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.lr, momentum=self.mom,
                                             weight_decay=self.weight_decay)
        else:
            self.optimizer = torch.optim.RMSprop(self.model.parameters(), lr=self.lr,
                                                 weight_decay=self.weight_decay)

        # fitting
        loss_epoch, metric_epoch = fit_functions[loss_name](X_test_tensor, y_test)

        # final evaluation of the trained model
        with torch.no_grad():
            if loss_name == 'ordinal_loss':
                score_test = self.predict_score(self.model, self.X_train_tensor, self.y_train, X_test_tensor)
                # the pair model needs two inputs; every test sample is paired with itself
                feature, _ = self.model(X_test_tensor, X_test_tensor)
                feature = feature.numpy()
            else:
                feature, score_test = self.model(X_test_tensor)
                feature = feature.numpy()
                score_test = score_test.numpy()

        performance = self.utils.metric(y_true=y_test, y_score=score_test)

        return {'data': data,
                'model_init': self.model_init,
                'model': self.model,
                'loss':loss_epoch,
                'metric_epoch': metric_epoch,
                'score': score_test,
                'feature': feature,
                'performance': performance}


# # init
# com = Comparison()
# # score distribution loss
# result_score_loss = com.fit2test(data=None, dataset=None, synthetic=True, synthetic_mode='GM',
#                                  resampling=True, loss_name='score_distribution_loss')
# print(result_score_loss['performance'])