diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..54acd81
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,414 @@
+import os
+import numpy as np
+import scanpy as sc
+import pandas as pd
+import anndata
+import umap
+from portal.model import *
+import rpy2.robjects as ro
+import rpy2.robjects.numpy2ri
+import anndata2ri
+from sklearn.decomposition import PCA
+from sklearn.neighbors import NearestNeighbors
+
+
+def preprocess_datasets(adata_list,  # list of anndata objects to be integrated
+                        hvg_num=4000,  # number of highly variable genes for each anndata
+                        save_embedding=False,  # save low-dimensional embeddings or not
+                        data_path="data"
+                        ):
+    # note: the AnnData objects in adata_list are modified in place
+
+    if len(adata_list) < 2:
+        raise ValueError("There should be at least two datasets for integration!")
+
+    sample_size_list = []
+
+    print("Finding highly variable genes...")
+    for i, adata in enumerate(adata_list):
+        sample_size_list.append(adata.shape[0])
+        sc.pp.highly_variable_genes(adata, flavor='seurat_v3', n_top_genes=hvg_num)
+        hvg = adata.var[adata.var.highly_variable].sort_values(by="highly_variable_rank").index
+        if i == 0:
+            hvg_total = hvg
+        else:
+            hvg_total = hvg_total.intersection(hvg)
+        if len(hvg_total) < 100:
+            raise ValueError("The total number of highly variable genes is smaller than 100 (%d). Try to set a larger hvg_num." % len(hvg_total))
+
+    print("Normalizing and scaling...")
+    for i, adata in enumerate(adata_list):
+        sc.pp.normalize_total(adata, target_sum=1e4)
+        sc.pp.log1p(adata)
+        adata = adata[:, hvg_total]
+        sc.pp.scale(adata, max_value=10)
+        if i == 0:
+            adata_total = adata
+        else:
+            adata_total = adata_total.concatenate(adata, index_unique=None)
+
+    print("Dimensionality reduction via PCA...")
+    npcs = 30
+    pca = PCA(n_components=npcs, svd_solver="arpack", random_state=0)
+    adata_total.obsm["X_pca"] = pca.fit_transform(adata_total.X)
+
+    indices = np.cumsum(sample_size_list)
+
+    data_path = os.path.join(data_path, "preprocess")
+    if not os.path.exists(data_path):
+        os.makedirs(data_path)
+
+    if save_embedding:
+        for i in range(len(indices)):
+            if i == 0:
+                np.save(os.path.join(data_path, "lowdim_1.npy"),
+                        adata_total.obsm["X_pca"][:indices[0], :npcs])
+            else:
+                np.save(os.path.join(data_path, "lowdim_%d.npy" % (i + 1)),
+                        adata_total.obsm["X_pca"][indices[i - 1]:indices[i], :npcs])
+
+    lowdim = adata_total.obsm["X_pca"].copy()
+    lowdim_list = [lowdim[:indices[0], :npcs] if i == 0 else lowdim[indices[i - 1]:indices[i], :npcs]
+                   for i in range(len(indices))]
+
+    return lowdim_list
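+
+# Example usage (a minimal sketch added for illustration, not part of this module):
+# integrate two AnnData objects read from hypothetical .h5ad files. The inputs are
+# assumed to hold raw counts in .X, as required by flavor='seurat_v3'.
+#
+#   adata_1 = sc.read_h5ad("batch1.h5ad")
+#   adata_2 = sc.read_h5ad("batch2.h5ad")
+#   lowdim_list = preprocess_datasets([adata_1, adata_2], hvg_num=4000)
+#   # lowdim_list[i] is the (n_cells_i, 30) PCA embedding of dataset i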
+
+
+def integrate_datasets(lowdim_list,  # list of low-dimensional representations
+                       search_cos=False,  # search for an optimal lambda_cos or not
+                       lambda_cos=20.0,
+                       training_steps=2000,
+                       space=None,  # None or "reference" or "latent"
+                       data_path="data"
+                       ):
+
+    if space is None:
+        space = "latent" if len(lowdim_list) == 2 else "reference"
+
+    print("Incrementally integrating %d datasets..." % len(lowdim_list))
+
+    if not search_cos:
+        # use the given hyperparameter lambda_cos directly
+        for i in range(len(lowdim_list) - 1):
+            print("Integrating dataset %d into the first dataset..." % (i + 2))
+            model = Model(lambdacos=lambda_cos,
+                          training_steps=training_steps,
+                          data_path=os.path.join(data_path, "preprocess"),
+                          model_path="models/%d_datasets" % (i + 2),
+                          result_path="results/%d_datasets" % (i + 2))
+            if i == 0:
+                model.emb_A = lowdim_list[0]
+            else:
+                model.emb_A = emb_total
+            model.emb_B = lowdim_list[i + 1]
+            model.train()
+            model.eval()
+            emb_total = model.data_Aspace
+        if space == "reference":
+            return emb_total
+        elif space == "latent":
+            return model.latent
+        else:
+            raise ValueError("space should be either 'reference' or 'latent'.")
+    else:
+        # grid-search lambda_cos, keeping the model with the lowest mixing metric
+        for i in range(len(lowdim_list) - 1):
+            print("Integrating dataset %d into the first dataset..." % (i + 2))
+            for lambda_cos in [15.0, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0]:
+                model = Model(lambdacos=lambda_cos,
+                              training_steps=training_steps,
+                              data_path=os.path.join(data_path, "preprocess"),
+                              model_path="models/%d_datasets" % (i + 2),
+                              result_path="results/%d_datasets" % (i + 2))
+                if i == 0:
+                    model.emb_A = lowdim_list[0]
+                else:
+                    model.emb_A = emb_total
+                model.emb_B = lowdim_list[i + 1]
+                model.train()
+                model.eval()
+                meta = pd.DataFrame(index=np.arange(model.emb_A.shape[0] + model.emb_B.shape[0]))
+                meta["method"] = ["A"] * model.emb_A.shape[0] + ["B"] * model.emb_B.shape[0]
+                mixing = calculate_mixing_metric(model.latent, meta, k=5, max_k=300, methods=list(set(meta.method)))
+                print("lambda_cos: %f, mixing metric: %f\n" % (lambda_cos, mixing))
+                if lambda_cos == 15.0:
+                    model_opt = model
+                    mixing_metric_opt = mixing
+                elif mixing < mixing_metric_opt:
+                    model_opt = model
+                    mixing_metric_opt = mixing
+            emb_total = model_opt.data_Aspace
+        if space == "reference":
+            return emb_total
+        elif space == "latent":
+            return model_opt.latent
+        else:
+            raise ValueError("space should be either 'reference' or 'latent'.")
+
+
+def calculate_mixing_metric(data, meta, methods, k=5, max_k=300, subsample=True):
+    if subsample:
+        if data.shape[0] >= 1e5:
+            np.random.seed(1234)
+            subsample_idx = np.random.choice(data.shape[0], 50000, replace=False)
+            data = data[subsample_idx]
+            meta = meta.iloc[subsample_idx]
+            meta.index = np.arange(len(subsample_idx))
+    lowdim = data
+
+    nbrs = NearestNeighbors(n_neighbors=max_k, algorithm='kd_tree').fit(lowdim)
+    _, indices = nbrs.kneighbors(lowdim)
+    indices = indices[:, 1:]  # drop the self-neighbor
+    mixing = np.zeros((data.shape[0], 2))
+    method_labels = meta["method"].values
+    for i in range(data.shape[0]):
+        # rank of the k-th nearest neighbor from each dataset (max_k - 1 if absent)
+        for j in range(2):
+            hits = np.where(method_labels[indices[i, :]] == methods[j])[0]
+            mixing[i, j] = hits[k - 1] if len(hits) > k - 1 else max_k - 1
+    return np.mean(np.median(mixing, axis=1) + 1)
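+
+# Example usage (a minimal sketch; `lowdim_list` is the output of
+# preprocess_datasets above):
+#
+#   # integrate with a fixed lambda_cos, returning shared latent coordinates
+#   emb = integrate_datasets(lowdim_list, lambda_cos=20.0, space="latent")
+#
+#   # or search lambda_cos over a grid, scored by the mixing metric below
+#   emb = integrate_datasets(lowdim_list, search_cos=True, space="latent")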
+
+
+def calculate_ARI(data, meta, anno_A="drop_subcluster", anno_B="subcluster"):
+    if data.shape[0] >= 1e5:
+        np.random.seed(1234)
+        subsample_idx = np.random.choice(data.shape[0], 50000, replace=False)
+        data = data[subsample_idx]
+        meta = meta.iloc[subsample_idx]
+    lowdim = data
+
+    cellid = meta.index.astype(str)
+    method = meta["method"].astype(str)
+    cluster_A = meta[anno_A].astype(str)
+    if anno_B != anno_A:
+        cluster_B = meta[anno_B].astype(str)
+
+    rpy2.robjects.numpy2ri.activate()
+    nr, nc = lowdim.shape
+    lowdim = ro.r.matrix(lowdim, nrow=nr, ncol=nc)
+    ro.r.assign("data", lowdim)
+    rpy2.robjects.numpy2ri.deactivate()
+
+    cellid = ro.StrVector(cellid)
+    ro.r.assign("cellid", cellid)
+    method = ro.StrVector(method)
+    ro.r.assign("method", method)
+    cluster_A = ro.StrVector(cluster_A)
+    ro.r.assign("cluster_A", cluster_A)
+    if anno_B != anno_A:
+        cluster_B = ro.StrVector(cluster_B)
+        ro.r.assign("cluster_B", cluster_B)
+
+    ro.r("set.seed(1234)")
+    ro.r['library']("Seurat")
+    ro.r['library']("mclust")
+
+    # build a Seurat object whose "scunit" reduction holds the integrated embedding
+    ro.r("comb_normalized <- t(data)")
+    ro.r('''rownames(comb_normalized) <- paste("gene", 1:nrow(comb_normalized), sep = "")''')
+    ro.r("colnames(comb_normalized) <- as.vector(cellid)")
+
+    ro.r("comb_raw <- matrix(0, nrow = nrow(comb_normalized), ncol = ncol(comb_normalized))")
+    ro.r("rownames(comb_raw) <- rownames(comb_normalized)")
+    ro.r("colnames(comb_raw) <- colnames(comb_normalized)")
+
+    ro.r("comb <- CreateSeuratObject(comb_raw)")
+    ro.r('''scunitdata <- Seurat::CreateDimReducObject(
+                embeddings = t(comb_normalized),
+                stdev = as.numeric(apply(comb_normalized, 2, stats::sd)),
+                assay = "RNA",
+                key = "scunit")''')
+    ro.r('''comb[["scunit"]] <- scunitdata''')
+
+    ro.r("comb@meta.data$method <- method")
+    ro.r("comb@meta.data$cluster_A <- cluster_A")
+    if anno_B != anno_A:
+        ro.r("comb@meta.data$cluster_B <- cluster_B")
+
+    # cluster on the embedding, then compare Seurat clusters with the annotations
+    ro.r('''comb <- FindNeighbors(comb, reduction = "scunit", dims = 1:ncol(data), force.recalc = TRUE, verbose = FALSE)''')
+    ro.r('''comb <- FindClusters(comb, verbose = FALSE)''')
+
+    if anno_B != anno_A:
+        method_set = pd.unique(meta["method"])
+        ro.r.assign("method_A", method_set[0])
+        ro.r.assign("method_B", method_set[1])
+        ro.r('''indx_A <- which(comb$method == method_A)''')
+        ro.r('''indx_B <- which(comb$method == method_B)''')
+
+        ro.r("ARI_A <- adjustedRandIndex(comb$cluster_A[indx_A], comb$seurat_clusters[indx_A])")
+        ro.r("ARI_B <- adjustedRandIndex(comb$cluster_B[indx_B], comb$seurat_clusters[indx_B])")
+        ARI_A = np.array(ro.r("ARI_A"))[0]
+        ARI_B = np.array(ro.r("ARI_B"))[0]
+        return ARI_A, ARI_B
+    else:
+        ro.r("ARI_A <- adjustedRandIndex(comb$cluster_A, comb$seurat_clusters)")
+        ARI_A = np.array(ro.r("ARI_A"))[0]
+        return ARI_A
+
+
+def calculate_kBET(data, meta):
+    cellid = meta.index.astype(str)
+    method = meta["method"].astype(str)
+
+    rpy2.robjects.numpy2ri.activate()
+    nr, nc = data.shape
+    data = ro.r.matrix(data, nrow=nr, ncol=nc)
+    ro.r.assign("data", data)
+    rpy2.robjects.numpy2ri.deactivate()
+
+    cellid = ro.StrVector(cellid)
+    ro.r.assign("cellid", cellid)
+    method = ro.StrVector(method)
+    ro.r.assign("method", method)
+
+    ro.r("set.seed(1234)")
+    ro.r['library']("kBET")
+
+    # run kBET on 100 random subsets of 1000 cells; report the median acceptance rate
+    accept_rate = []
+    for _ in range(100):
+        ro.r("subset_id <- sample.int(n = length(method), size = 1000, replace = FALSE)")
+        ro.r("batch.estimate <- kBET(data[subset_id,], method[subset_id], do.pca = FALSE, plot = FALSE)")
+        accept_rate.append(np.array(ro.r("mean(batch.estimate$results$kBET.pvalue.test > 0.05)")))
+
+    return np.median(accept_rate)
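+
+# Example usage (a minimal sketch; both metrics call into R via rpy2, so the
+# Seurat, mclust, and kBET packages must be installed in the R library. `meta`
+# is assumed to carry a "method" batch column plus the annotation columns named
+# by anno_A/anno_B; the column names below are placeholders):
+#
+#   ARI_A, ARI_B = calculate_ARI(emb, meta, anno_A="celltype_A", anno_B="celltype_B")
+#   acceptance = calculate_kBET(emb, meta)  # median over 100 subsets of 1000 cells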
+
+
+def calculate_ASW(data, meta, anno_A="drop_subcluster", anno_B="subcluster"):
+    if data.shape[0] >= 1e5:
+        np.random.seed(1234)
+        subsample_idx = np.random.choice(data.shape[0], 50000, replace=False)
+        data = data[subsample_idx]
+        meta = meta.iloc[subsample_idx]
+    lowdim = data
+
+    cellid = meta.index.astype(str)
+    method = meta["method"].astype(str)
+    cluster_A = meta[anno_A].astype(str)
+    if anno_B != anno_A:
+        cluster_B = meta[anno_B].astype(str)
+
+    rpy2.robjects.numpy2ri.activate()
+    nr, nc = lowdim.shape
+    lowdim = ro.r.matrix(lowdim, nrow=nr, ncol=nc)
+    ro.r.assign("data", lowdim)
+    rpy2.robjects.numpy2ri.deactivate()
+
+    cellid = ro.StrVector(cellid)
+    ro.r.assign("cellid", cellid)
+    method = ro.StrVector(method)
+    ro.r.assign("method", method)
+    cluster_A = ro.StrVector(cluster_A)
+    ro.r.assign("cluster_A", cluster_A)
+    if anno_B != anno_A:
+        cluster_B = ro.StrVector(cluster_B)
+        ro.r.assign("cluster_B", cluster_B)
+
+    ro.r("set.seed(1234)")
+    ro.r['library']("cluster")
+
+    # average silhouette width on the first 20 dimensions, per dataset when
+    # two different annotations are given
+    if anno_B != anno_A:
+        method_set = pd.unique(meta["method"])
+        ro.r.assign("method_A", method_set[0])
+        ro.r.assign("method_B", method_set[1])
+        ro.r('''indx_A <- which(method == method_A)''')
+        ro.r('''indx_B <- which(method == method_B)''')
+        ro.r('''ASW_A <- summary(silhouette(as.numeric(as.factor(cluster_A[indx_A])), dist(data[indx_A, 1:20])))[["avg.width"]]''')
+        ro.r('''ASW_B <- summary(silhouette(as.numeric(as.factor(cluster_B[indx_B])), dist(data[indx_B, 1:20])))[["avg.width"]]''')
+        ASW_A = np.array(ro.r("ASW_A"))[0]
+        ASW_B = np.array(ro.r("ASW_B"))[0]
+        return ASW_A, ASW_B
+    else:
+        ro.r('''ASW_A <- summary(silhouette(as.numeric(as.factor(cluster_A)), dist(data[, 1:20])))[["avg.width"]]''')
+        ASW_A = np.array(ro.r("ASW_A"))[0]
+        return ASW_A
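+
+# Example usage (a minimal sketch; requires the R "cluster" package; silhouettes
+# are computed on the first 20 dimensions of the embedding. Passing the same
+# column for anno_A and anno_B returns a single value; "celltype" is a placeholder):
+#
+#   ASW = calculate_ASW(emb, meta, anno_A="celltype", anno_B="celltype")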
+
+
+def plot_UMAP(data, meta, space="latent", score=None, colors=["method"], subsample=False,
+              save=False, result_path=None, filename_suffix=None):
+    if save:
+        if result_path is None:
+            raise ValueError("result_path must be provided when save=True.")
+        if filename_suffix is not None:
+            filenames = [os.path.join(result_path, "%s-%s-%s.pdf" % (space, c, filename_suffix)) for c in colors]
+        else:
+            filenames = [os.path.join(result_path, "%s-%s.pdf" % (space, c)) for c in colors]
+
+    if subsample:
+        if data.shape[0] >= 1e5:
+            np.random.seed(1234)
+            subsample_idx = np.random.choice(data.shape[0], 50000, replace=False)
+            data = data[subsample_idx]
+            meta = meta.iloc[subsample_idx]
+            if score is not None:
+                score = score[subsample_idx]
+
+    adata = anndata.AnnData(X=data)
+    adata.obs.index = meta.index
+    adata.obs = pd.concat([adata.obs, meta], axis=1)
+    adata.var.index = "dim-" + adata.var.index
+    adata.obsm["latent"] = data
+
+    # run UMAP
+    reducer = umap.UMAP(n_neighbors=30,
+                        n_components=2,
+                        metric="correlation",
+                        n_epochs=None,
+                        learning_rate=1.0,
+                        min_dist=0.3,
+                        spread=1.0,
+                        set_op_mix_ratio=1.0,
+                        local_connectivity=1,
+                        repulsion_strength=1,
+                        negative_sample_rate=5,
+                        a=None,
+                        b=None,
+                        random_state=1234,
+                        metric_kwds=None,
+                        angular_rp_forest=False,
+                        verbose=True)
+    embedding = reducer.fit_transform(adata.obsm["latent"])
+    adata.obsm["X_umap"] = embedding
+
+    # shrink point size for large datasets
+    n_cells = embedding.shape[0]
+    size = 120000 / n_cells if n_cells >= 10000 else 12
+
+    for i, c in enumerate(colors):
+        groups = sorted(set(adata.obs[c].astype(str)))
+        if "nan" in groups:
+            groups.remove("nan")
+        palette = "rainbow"
+        if save:
+            fig = sc.pl.umap(adata, color=c, palette=palette, groups=groups, return_fig=True, size=size)
+            fig.savefig(filenames[i], bbox_inches='tight', dpi=300)
+        else:
+            sc.pl.umap(adata, color=c, palette=palette, groups=groups, size=size)
+
+    # for "Aspace"/"Bspace", additionally plot the (continuous) discriminator score
+    # and the margin indicator (score < -5.0) of the dataset mapped into that space,
+    # saving both figures under result_path
+    if space in ("Aspace", "Bspace"):
+        method_set = pd.unique(meta["method"])
+        target = method_set[1] if space == "Aspace" else method_set[0]
+        adata.obs["score"] = score
+        adata.obs["margin"] = (score < -5.0) * 1
+        fig = sc.pl.umap(adata[meta["method"] == target], color="score", return_fig=True, size=size)
+        fig.savefig(os.path.join(result_path, "%s-score.pdf" % space), bbox_inches='tight', dpi=300)
+        fig = sc.pl.umap(adata[meta["method"] == target], color="margin", return_fig=True, size=size)
+        fig.savefig(os.path.join(result_path, "%s-margin.pdf" % space), bbox_inches='tight', dpi=300)
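+
+# Example usage (a minimal sketch): color the integrated embedding by batch and
+# save the figure. `emb` and `meta` are assumed to come from the functions above,
+# and the "results" directory is assumed to already exist.
+#
+#   plot_UMAP(emb, meta, space="latent", colors=["method"],
+#             save=True, result_path="results")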