diff --git a/Cell_BLAST/__init__.py b/Cell_BLAST/__init__.py
index 56771b6..146a46e 100644
--- a/Cell_BLAST/__init__.py
+++ b/Cell_BLAST/__init__.py
@@ -30,4 +30,4 @@
     "config"
 ]
 
-__version__ = "0.1.1"
+__version__ = "0.1.2"
diff --git a/Cell_BLAST/data.py b/Cell_BLAST/data.py
index a9bc679..d232d29 100644
--- a/Cell_BLAST/data.py
+++ b/Cell_BLAST/data.py
@@ -294,6 +294,16 @@ def __getitem__(self, slices):
             uns=copy.deepcopy(dict(self.uns))
         )
 
+    def clean_duplicate_vars(self):
+        unique_vars, duplicate_mask = \
+            set(), np.ones(self.var_names.size).astype(np.bool_)
+        for idx, item in enumerate(self.var_names):
+            if item in unique_vars:
+                duplicate_mask[idx] = False
+            else:
+                unique_vars.add(item)
+        return self[:, duplicate_mask]
+
     def get_meta_or_var(self, names, normalize_var=False, log_var=False):
         """
         Get either meta information (column names in ``obs``) or
@@ -796,7 +806,7 @@ def obs_correlation_heatmap(
 
     def violin(
             self, group, var, normalize_var=True, width=7, height=7,
-            ax=None, **kwargs
+            ax=None, strip_kws=None, violin_kws=None
     ):
         """
         Violin plot across obs groups.
@@ -816,7 +826,10 @@ def violin(
         ax : matplotlib.axes.Axes
             Specify an existing axes to plot onto, by default None.
            If specified, ``width`` and ``height`` take no effect.
-        **kwargs
+        strip_kws : dict
+            Additional keyword arguments will be passed to
+            ``seaborn.stripplot``.
+        violin_kws : dict
            Additional keyword arguments will be passed to
            ``seaborn.violinplot``.

@@ -828,15 +841,22 @@ def violin(
         import matplotlib.pyplot as plt
         import seaborn as sns
 
+        strip_kws = {} if strip_kws is None else strip_kws
+        violin_kws = {} if violin_kws is None else violin_kws
+
         df = self.get_meta_or_var(
             [group, var], normalize_var=normalize_var, log_var=True
         )
         if ax is None:
             _, ax = plt.subplots(figsize=(width, height))
+        ax = sns.stripplot(
+            x=group, y=var, data=df,
+            color=".3", edgecolor=None, size=3, ax=ax, **strip_kws
+        )
         ax = sns.violinplot(
             x=group, y=var, data=df,
-            scale="width", ax=ax, inner="point", **kwargs
+            scale="width", ax=ax, inner=None, **violin_kws
         )
         ax.spines["right"].set_visible(False)
         ax.spines["top"].set_visible(False)
diff --git a/Datasets/ACA_datasets.csv b/Datasets/ACA_datasets.csv
index c1cd177..3643086 100644
--- a/Datasets/ACA_datasets.csv
+++ b/Datasets/ACA_datasets.csv
@@ -10,9 +10,9 @@ Dahlin_10x,Mus musculus,Bone Marrow,NA,10x,46447,A single-cell hematopoietic lan
 Dahlin_mutant,Mus musculus,Bone Marrow,NA,10x,14675,A single-cell hematopoietic landscape resolves 8 lineage trajectories and defects in Kit mutant mice,TRUE,c-Kit mutant,collect_dahlin.R
 Quake_10x_Bone_Marrow,Mus musculus,Bone Marrow,,10x,3652,Single-cell transcriptomics of 20 mouse organs creates a Tabula Muris,TRUE,,collect_quake_10x.R
 Quake_Smart-seq2_Bone_Marrow,Mus musculus,Bone Marrow,,Smart-seq2,5037,Single-cell transcriptomics of 20 mouse organs creates a Tabula Muris,TRUE,,collect_quake_smartseq2.R
-Tusi,Mus musculus,Bone Marrow,adult,inDrop,4763,Population snapshots predict early haematopoietic and erythroid hierarchies,FALSE,Continuous,collect_tusi.R
-Velten_QUARTZ-seq,Homo sapiens,Bone Marrow,29-year-old,QUARTZ-seq,379,Human haematopoietic stem cell lineage commitment is a continuous process,FALSE,"no meta, continuous",collect_velten.R
-Velten_Smart-seq2,Homo sapiens,Bone Marrow,25-year-old,Smart-seq2,1035,Human haematopoietic stem cell lineage commitment is a continuous process,FALSE,"no meta, continuous",collect_velten.R
+Tusi,Mus musculus,Bone Marrow,adult,inDrop,4763,Population snapshots predict early haematopoietic and erythroid hierarchies,TRUE,Continuous,collect_tusi.R
+Velten_QUARTZ-seq,Homo sapiens,Bone Marrow,29-year-old,QUARTZ-seq,379,Human haematopoietic stem cell lineage commitment is a continuous process,TRUE,"no meta, continuous",collect_velten.R
+Velten_Smart-seq2,Homo sapiens,Bone Marrow,25-year-old,Smart-seq2,1035,Human haematopoietic stem cell lineage commitment is a continuous process,TRUE,"no meta, continuous",collect_velten.R
 Campbell,Mus musculus,Brain,,Drop-seq,20921,A molecular census of arcuate hypothalamus and median eminence cell types,TRUE,Adult Arc-ME complex,collect_campbell.R
 Chen,Mus musculus,Brain,,Drop-seq,12089,Single-Cell RNA-Seq Reveals Hypothalamic Cell Diversity,TRUE,Adult hypothalamus,collect_chen.R
 Lake_2018,Homo sapiens,Brain,,snDrop-seq,35289,Integrative single-cell analysis of transcriptional and epigenetic states in the human adult brain,TRUE,,collect_lake_2018.R
@@ -85,4 +85,4 @@ Montoro_10x,Mus musculus,Trachea,adult,10x,7193,A revised airway epithelial hier
 Montoro_Smart-seq2,Mus musculus,Trachea,adult,modified Smart-seq2,301,A revised airway epithelial hierarchy includes CFTR-expressing ionocytes,TRUE,3 WT,collect_montoro_smartseq2.R
 Plasschaert,Mus musculus,Trachea,adult,inDrop,6977,A single-cell atlas of the airway epithelium reveals the CFTR-rich pulmonary ionocyte,TRUE,4 WT,collect_plasschaert.R
 Quake_10x_Trachea,Mus musculus,Trachea,,10x,11269,Single-cell transcriptomics of 20 mouse organs creates a Tabula Muris,TRUE,,collect_quake_10x.R
-Quake_Smart-seq2_Trachea,Mus musculus,Trachea,,Smart-seq2,1350,Single-cell transcriptomics of 20 mouse organs creates a Tabula Muris,TRUE,,collect_quake_smartseq2.R
+Quake_Smart-seq2_Trachea,Mus musculus,Trachea,,Smart-seq2,1350,Single-cell transcriptomics of 20 mouse organs creates a Tabula Muris,TRUE,,collect_quake_smartseq2.R
\ No newline at end of file
diff --git a/Datasets/aligned_datasets.csv b/Datasets/aligned_datasets.csv
index d56de6e..a449ae0 100644
--- a/Datasets/aligned_datasets.csv
+++ b/Datasets/aligned_datasets.csv
@@ -15,7 +15,8 @@ ALIGNED_Mus_musculus_Pancreas,Mus musculus,Pancreas,,,,,,"Baron_mouse, Quake_Sma
 ALIGNED_Mus_musculus_Retina,Mus musculus,Retina,,,,,,"Macosko, Shekhar"
 ALIGNED_Mus_musculus_Small_Intestine,Mus musculus,Small Intestine,,,,,,"Haber_10x, Haber_10x_largecell, Haber_10x_region, Haber_10x_FAE, Haber_Smart-seq2"
 ALIGNED_Mus_musculus_Spleen,Mus musculus,Spleen,,,,,,"Quake_10x_Spleen, Quake_Smart-seq2_Spleen"
+ALIGNED_Mus_musculus_Heart_and_Aorta,Mus musculus,"Heart, Aorta",,,,,,"Quake_10x_Heart_and_Aorta, Quake_Smart-seq2_Heart"
 ALIGNED_Mus_musculus_Thymus,Mus musculus,Thymus,,,,,,"Quake_10x_Thymus, Quake_Smart-seq2_Thymus"
 ALIGNED_Mus_musculus_Tongue,Mus musculus,Tongue,,,,,,"Quake_10x_Tongue, Quake_Smart-seq2_Tongue"
 ALIGNED_Mus_musculus_Trachea,Mus musculus,Trachea,,,,,,"Quake_10x_Trachea, Quake_Smart-seq2_Trachea"
-ALIGNED_Tabula_Muris,Mus musculus,Trachea,,,,,,"Quake_10x, Quake_Smart-seq2"
+ALIGNED_Tabula_Muris,Mus musculus,Atlas,,,,,,"Quake_10x, Quake_Smart-seq2"
diff --git a/Datasets/collect/collect_ariss.R b/Datasets/collect/collect_ariss.R
new file mode 100644
index 0000000..1f64ff0
--- /dev/null
+++ b/Datasets/collect/collect_ariss.R
@@ -0,0 +1,70 @@
+#! /usr/bin/env Rscript
+# by wangshuai
+# 11 Mar 2019
+# 14:36:35 PM
+
+suppressPackageStartupMessages({
+    library(Seurat)
+})
+source("../../Utilities/data.R", chdir = TRUE)
+
+#READ label file
+cat("Reading label file...\n")
+metadata1 <- read.table("../download/Ariss/wt_Rbf_and_populations.txt",header=T,stringsAsFactors = F)
+row.names(metadata1) <- metadata1[,'CellName']
+metadata1 <- metadata1[,c('Genotype','cell_type1')]
+
+metadata2 <- read.table("../download/Ariss/Cells_and_population.txt",header=T,row.names=1,stringsAsFactors = F)
+metadata2$Genotype <- 'wt'
+
+includedcells<-union(row.names(metadata1),row.names(metadata2))
+metadata <- rbind(metadata2,metadata1)
+metadata <- metadata[which(row.names(metadata) %in% includedcells),]
+
+celltypes <- read.csv('../download/celltypes',sep='\t')
+
+metadata$lifestage <- 'third instar larva stage'
+metadata$organ <- 'eye disc'
+metadata$race <- 'Drosophila melanogaster'
+
+#READ DGE
+cat("Reading DGE\n")
+path <- "../download/Ariss/GSE115476"
+fileNames <- dir(path)
+filePath <- sapply(fileNames, function(x){
+    paste(path,x,sep='/')})
+data <- lapply(filePath, function(x){
+    read.table(x, header=T,stringsAsFactors = F)})
+
+i <- 1
+for (name in names(data)){
+    perfix<-substr(name,gregexpr(pattern = '_',text = name)[[1]]+1,gregexpr(pattern = "\\.",text = name)[[1]]-1)
+    colnames(data[[i]]) <- lapply(colnames(data[[i]]),function(x){
+        paste(perfix,x,sep='_')})
+    genes <- data[[i]][,1]
+    included_cells <- intersect(rownames(metadata), colnames(data[[i]]))
+    data[[i]] <- data.frame(genes,data[[i]][, included_cells])
+    i <- i+1
+}
+
+expmerge <- Reduce(function(x,y) merge(x,y,by=1,all=T),data)
+row.names(expmerge)<-expmerge[,1]
+expmerge<-expmerge[,-1]
+included_cells <- intersect(rownames(metadata), colnames(expmerge))
+metadata <- metadata[included_cells, ]
+expmerge <- expmerge[, included_cells]
+expmerge[is.na(expmerge)]<-0
+
+expressed_genes <- rownames(expmerge)[rowSums(expmerge > 1) > 5]
+expmerge <- Matrix(as.matrix(expmerge),sparse = T)
+
+message("Constructing dataset...")
+dataset <- new("ExprDataSet",
+    exprs = expmerge, obs = metadata,
+    var = data.frame(row.names = rownames(expmerge)),
+    uns = list(expressed_genes = expressed_genes)
+)
+
+message("Saving data...")
+write_dataset(dataset, "../data/Ariss/data.h5")
+cat("Done!\n")
diff --git a/Datasets/collect/collect_tusi.R b/Datasets/collect/collect_tusi.R
index 25dc398..1e5fcb6 100755
--- a/Datasets/collect/collect_tusi.R
+++ b/Datasets/collect/collect_tusi.R
@@ -27,10 +27,15 @@ colnames(potential) <- "potential"
 meta_df <- Reduce(cbind, list(meta_df, fate, potential))
 rownames(meta_df) <- meta_df$cell_id
 meta_df$cell_id <- NULL
+meta_df$cell_type1 = "HSPC"
 
 expr_mat <- expr_mat[rownames(meta_df), ]
 
+#assign cell ontology
+cell_ontology <- read.csv("../cell_ontology/bone_marrow_cell_ontology.csv")
+cell_ontology <- cell_ontology[, c("cell_type1", "cell_ontology_class", "cell_ontology_id")]
+
 #datasets_meta
 datasets_meta <- read.csv("../ACA_datasets.csv", header = TRUE, row.names = 1)
-construct_dataset("../data/Tusi", t(expr_mat), meta_df, datasets_meta, grouping = "batch")
+construct_dataset("../data/Tusi", t(expr_mat), meta_df, datasets_meta, cell_ontology, grouping = "batch")
 message("Done!")
diff --git a/doc/_static/BLAST.html b/doc/_static/BLAST.html
new file mode 100644
index 0000000..d1f4a9a
--- /dev/null
+++ b/doc/_static/BLAST.html
@@ -0,0 +1,12222 @@
Cell BLAST tutorial

In [1]:
import time
+import warnings
+warnings.filterwarnings("ignore")
+
+import tensorflow as tf
+tf.logging.set_verbosity(0)
+
+import Cell_BLAST as cb
+cb.config.N_JOBS = 4
+cb.config.RANDOM_SEED = 0

In this tutorial, we demonstrate how to perform cell BLAST based on DIRECTi models.


Again, we use the pancreas datasets as an example.

In [2]:
baron_human = cb.data.ExprDataSet.read_dataset("../../Datasets/data/Baron_human/data.h5").normalize()

Preparing database

Cell BLAST uses multiple models to increase specificity.


Here we first train 4 DIRECTi models, each with a different random seed. The maximum number of epochs is set to 50 to save time.

In [3]:
%%capture
+models = []
+for i in range(4):
+    models.append(cb.directi.fit_DIRECTi(
+        baron_human, baron_human.uns["seurat_genes"], latent_dim=10, cat_dim=20,
+        epoch=50, patience=10, random_seed=i,
+        path="/tmp/cb/examples/baron_human_blast_models/model_%d" % i
+    ))

Then we build a cell BLAST "database" by feeding our previously trained models and the reference dataset to the BLAST constructor. The chained method build_empirical enables empirical p-values in downstream analyses.

In [4]:
blast = cb.blast.BLAST(models, baron_human).build_empirical()
[  Info   ] Projecting to latent space...
+[  Info   ] Fitting nearest neighbor trees...
+[  Info   ] Computing posterior distribution distances...

Like DIRECTi models, BLAST objects can be easily saved and loaded.

In [5]:
blast.save("./baron_human_blast")
+del blast
+blast = cb.blast.BLAST.load("./baron_human_blast")
[  Info   ] Loading latent module weights...
+[  Info   ] Loading prob module weights...
+[  Info   ] Loading rmbatch module weights...
+[  Info   ] Loading latent module weights...
+[  Info   ] Loading prob module weights...
+[  Info   ] Loading rmbatch module weights...
+[  Info   ] Loading latent module weights...
+[  Info   ] Loading prob module weights...
+[  Info   ] Loading rmbatch module weights...
+[  Info   ] Loading latent module weights...
+[  Info   ] Loading prob module weights...
+[  Info   ] Loading rmbatch module weights...

BLAST query

We load another pancreas dataset to demonstrate the querying process.

In [6]:
lawlor = cb.data.ExprDataSet.read_dataset("../../Datasets/data/Lawlor/data.h5").normalize()

To query the database, we first use the query method to obtain initial hits in the reference database. This is done by an efficient Euclidean distance based nearest neighbor search in the latent space, and the nearest neighbors found by each model are merged. Though highly efficient, latent space Euclidean distance is not the best metric of cell-cell similarity, so to obtain better accuracy and specificity we also compute posterior distribution distances and empirical p-values for these nearest neighbors.

Then we use reconcile_models to pool together information from multiple models and filter the initial hits to obtain significant hits.

In [7]:
start_time = time.time()
+lawlor_hits = blast.query(lawlor).reconcile_models().filter(by="pval", cutoff=0.05)
+print("Querying time: %.3f ms/cell" % (
+    (time.time() - start_time) * 1000 / len(lawlor_hits)
+))
[  Info   ] Projecting to latent space...
+[ Warning ] 22 out of 769 variables are not found, will be set to zero!
+[ Warning ] 22 out of 769 variables are not found, will be set to zero!
+[ Warning ] 22 out of 769 variables are not found, will be set to zero!
+[ Warning ] 22 out of 769 variables are not found, will be set to zero!
+[ Warning ] 22 out of 769 variables are not found, will be set to zero!
+[ Warning ] 22 out of 769 variables are not found, will be set to zero!
+[ Warning ] 22 out of 769 variables are not found, will be set to zero!
+[ Warning ] 22 out of 769 variables are not found, will be set to zero!
+[  Info   ] Doing nearest neighbor search...
+[  Info   ] Merging hits across models...
+[  Info   ] Computing posterior distribution distances...
+[  Info   ] Computing empirical p-values...
+Querying time: 21.046 ms/cell

Detailed information can be checked by printing the hits object.

In [8]:
print(lawlor_hits[0:1])
>>> 1st-61_S27:
+                                cell_ontology_class cell_ontology_id  \
+human2_lib3.final_cell_0543  type B pancreatic cell       CL:0000169   
+human1_lib1.final_cell_0300  type B pancreatic cell       CL:0000169   
+human2_lib2.final_cell_0458  type B pancreatic cell       CL:0000169   
+human2_lib2.final_cell_0509  type B pancreatic cell       CL:0000169   
+human2_lib2.final_cell_0334  type B pancreatic cell       CL:0000169   
+human4_lib3.final_cell_0638  type B pancreatic cell       CL:0000169   
+human4_lib3.final_cell_0605  type B pancreatic cell       CL:0000169   
+human2_lib3.final_cell_0436  type B pancreatic cell       CL:0000169   
+human2_lib2.final_cell_0584  type B pancreatic cell       CL:0000169   
+human1_lib1.final_cell_0429  type B pancreatic cell       CL:0000169   
+human1_lib1.final_cell_0107  type B pancreatic cell       CL:0000169   
+human3_lib2.final_cell_0573  type B pancreatic cell       CL:0000169   
+human3_lib2.final_cell_0726  type B pancreatic cell       CL:0000169   
+human2_lib1.final_cell_0466  type B pancreatic cell       CL:0000169   
+human4_lib3.final_cell_0675  type B pancreatic cell       CL:0000169   
+human2_lib3.final_cell_0322  type B pancreatic cell       CL:0000169   
+human3_lib2.final_cell_0821  type B pancreatic cell       CL:0000169   
+
+                            cell_type1 dataset_name   donor library     organ  \
+human2_lib3.final_cell_0543       beta  Baron_human  human2    lib3  Pancreas   
+human1_lib1.final_cell_0300       beta  Baron_human  human1    lib1  Pancreas   
+human2_lib2.final_cell_0458       beta  Baron_human  human2    lib2  Pancreas   
+human2_lib2.final_cell_0509       beta  Baron_human  human2    lib2  Pancreas   
+human2_lib2.final_cell_0334       beta  Baron_human  human2    lib2  Pancreas   
+human4_lib3.final_cell_0638       beta  Baron_human  human4    lib3  Pancreas   
+human4_lib3.final_cell_0605       beta  Baron_human  human4    lib3  Pancreas   
+human2_lib3.final_cell_0436       beta  Baron_human  human2    lib3  Pancreas   
+human2_lib2.final_cell_0584       beta  Baron_human  human2    lib2  Pancreas   
+human1_lib1.final_cell_0429       beta  Baron_human  human1    lib1  Pancreas   
+human1_lib1.final_cell_0107       beta  Baron_human  human1    lib1  Pancreas   
+human3_lib2.final_cell_0573       beta  Baron_human  human3    lib2  Pancreas   
+human3_lib2.final_cell_0726       beta  Baron_human  human3    lib2  Pancreas   
+human2_lib1.final_cell_0466       beta  Baron_human  human2    lib1  Pancreas   
+human4_lib3.final_cell_0675       beta  Baron_human  human4    lib3  Pancreas   
+human2_lib3.final_cell_0322       beta  Baron_human  human2    lib3  Pancreas   
+human3_lib2.final_cell_0821       beta  Baron_human  human3    lib2  Pancreas   
+
+                                 organism platform   distance   p-value  
+human2_lib3.final_cell_0543  Homo sapiens   inDrop  10.779652  0.002462  
+human1_lib1.final_cell_0300  Homo sapiens   inDrop  10.989110  0.009304  
+human2_lib2.final_cell_0458  Homo sapiens   inDrop  11.517597  0.011572  
+human2_lib2.final_cell_0509  Homo sapiens   inDrop  11.824542  0.012801  
+human2_lib2.final_cell_0334  Homo sapiens   inDrop  12.293741  0.013508  
+human4_lib3.final_cell_0638  Homo sapiens   inDrop  12.972761  0.017467  
+human4_lib3.final_cell_0605  Homo sapiens   inDrop  13.321820  0.018960  
+human2_lib3.final_cell_0436  Homo sapiens   inDrop  13.310701  0.019880  
+human2_lib2.final_cell_0584  Homo sapiens   inDrop  13.707359  0.021922  
+human1_lib1.final_cell_0429  Homo sapiens   inDrop  14.626563  0.025689  
+human1_lib1.final_cell_0107  Homo sapiens   inDrop  14.503482  0.025698  
+human3_lib2.final_cell_0573  Homo sapiens   inDrop  16.044521  0.027302  
+human3_lib2.final_cell_0726  Homo sapiens   inDrop  15.000503  0.029080  
+human2_lib1.final_cell_0466  Homo sapiens   inDrop  15.392393  0.029155  
+human4_lib3.final_cell_0675  Homo sapiens   inDrop  15.540586  0.032070  
+human2_lib3.final_cell_0322  Homo sapiens   inDrop  15.806624  0.033552  
+human3_lib2.final_cell_0821  Homo sapiens   inDrop  16.375327  0.038331  

Finally, we can use the annotate method to obtain cell type predictions.

In [9]:
lawlor_predictions = lawlor_hits.annotate("cell_ontology_class")

Cell type predictions can be compared with the author-provided annotation via a Sankey diagram. We see that for the Lawlor query, the predictions are very accurate.

In [10]:
fig = cb.blast.sankey(
+    lawlor.obs["cell_ontology_class"].values,
+    lawlor_predictions.values.ravel(),
+    title="Lawlor to Baron_human", tint_cutoff=3
+)
[Figure: Sankey diagram "Lawlor to Baron_human" comparing author-provided cell types with Cell BLAST predictions]

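As a quick numeric complement to the Sankey diagram, the author labels can be cross-tabulated against the predictions with plain pandas. The following snippet is an added illustration rather than part of the original notebook; it assumes that lawlor_predictions is aligned cell-by-cell with lawlor.obs, and that cells rejected by the p-value filter carry a placeholder label (the exact label may differ):

import pandas as pd

# Cross-tabulate author-provided labels against Cell BLAST predictions.
truth = pd.Series(lawlor.obs["cell_ontology_class"].values, name="author")
pred = pd.Series(lawlor_predictions.values.ravel(), name="predicted")
print(pd.crosstab(truth, pred))

# Fraction of query cells whose prediction matches the author label exactly
# (cells rejected by the p-value filter count as mismatches here).
print("Exact agreement: %.1f%%" % (100 * (truth == pred).mean()))
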
diff --git a/doc/_static/DIRECTi.html b/doc/_static/DIRECTi.html
new file mode 100644
index 0000000..1f363fa
--- /dev/null
+++ b/doc/_static/DIRECTi.html
@@ -0,0 +1,12289 @@

DIRECTi tutorial

In [1]:
import time
+import warnings
+warnings.filterwarnings("ignore")
+
+import tensorflow as tf
+tf.logging.set_verbosity(0)
+
+import Cell_BLAST as cb
+cb.config.N_JOBS = 4
+cb.config.RANDOM_SEED = 0

Unsupervised dimension reduction

Let's first load a dataset (Baron, M. et al. Cell Syst, 2016), which profiles >8,000 human pancreatic islet cells.


Here we normalized the dataset so that the resulting model, once fitted, can be used to project other datasets normalized in the same way.


Theoretically speaking, read count distribution may deviate from negative binomial due to the scaling, but practically it still fits very well.

In [2]:
baron_human = cb.data.ExprDataSet.read_dataset("../../Datasets/data/Baron_human/data.h5").normalize()
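As an optional sanity check of the earlier claim that the scaled counts still behave roughly like negative binomial data, per-gene overdispersion can be inspected directly. This is a minimal sketch added for illustration, not part of the original notebook; it assumes the expression matrix is exposed as baron_human.exprs with cells as rows and genes as columns, which may differ between versions:

import numpy as np
import scipy.sparse

# Per-gene mean and variance of the normalized expression matrix.
mat = baron_human.exprs  # assumed attribute name (cells x genes)
if scipy.sparse.issparse(mat):
    gene_mean = np.asarray(mat.mean(axis=0)).ravel()
    gene_sq_mean = np.asarray(mat.multiply(mat).mean(axis=0)).ravel()
    gene_var = gene_sq_mean - gene_mean ** 2
else:
    gene_mean = np.asarray(mat).mean(axis=0)
    gene_var = np.asarray(mat).var(axis=0)

# A negative binomial model implies variance ~ mean + mean^2 / dispersion,
# so most expressed genes should lie above the Poisson variance == mean line.
expressed = gene_mean > 0
print("Fraction of expressed genes with variance > mean: %.2f"
      % float((gene_var[expressed] > gene_mean[expressed]).mean()))
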

Now we build and fit a DIRECTi model with the one-step fit_DIRECTi function.


We set latent space dimensionality to 10, and go with 20 intrinsic clusters.


To save time, we only train for 50 epochs. The model has not fully converged but already works well.

In [3]:
%%capture
+model = cb.directi.fit_DIRECTi(
+    baron_human, baron_human.uns["seurat_genes"],
+    latent_dim=10, cat_dim=20, epoch=50,
+    path="./baron_human_model"
+)

We can project cells into the low dimensional latent space using the inference method. It is recommended that you store the returned latent space into the latent slot of the original dataset object, which facilitates visualization.

In [4]:
baron_human.latent = model.inference(baron_human)
In [6]:
ax = baron_human.visualize_latent("cell_ontology_class")
[  Info   ] Using cached tSNE...
[Figure: tSNE plot of the Baron_human latent space, colored by cell_ontology_class]

We see that different cell types can readily be distinguished.


Note that although a 20-dimensional categorical latent was used, far fewer clusters form in the latent space. This is because the model is free to discard categories, or to use multiple categories to represent the same cluster, when a redundant number of categories is specified.


You can also save the model for future use. It is straightforward to load a saved model.

In [7]:
model.save()
+model.close()
+del model
+model = cb.directi.DIRECTi.load("./baron_human_model")
[  Info   ] Loading latent module weights...
+[  Info   ] Loading prob module weights...
+[  Info   ] Loading rmbatch module weights...

We can also project other datasets with the same model. Here we test with the Muraro dataset (Muraro, M. et al. Cell Systems, 2016).


There will be a warning that some genes are missing in the new dataset, but this hardly matters in practice. Distinct cell types are still well separated.

In [8]:
muraro = cb.data.ExprDataSet.read_dataset("../../Datasets/data/Muraro/data.h5").normalize()
+muraro.latent = model.inference(muraro)
+ax = muraro.visualize_latent("cell_ontology_class")
[ Warning ] 18 out of 769 variables are not found, will be set to zero!
+[  Info   ] Computing tSNE...
[Figure: tSNE plot of the projected Muraro cells, colored by cell_ontology_class]

Systematic bias / batch effect removal

If we train on a "meta" dataset merged from multiple datasets, we will find significant systematic bias among them.

In [9]:
combined_dataset = cb.data.ExprDataSet.merge_datasets({
+    "Baron_human": cb.data.ExprDataSet.read_dataset("../../Datasets/data/Baron_human/data.h5"),
+    "Segerstolpe": cb.data.ExprDataSet.read_dataset("../../Datasets/data/Segerstolpe/data.h5"),
+    "Muraro": cb.data.ExprDataSet.read_dataset("../../Datasets/data/Muraro/data.h5"),
+    "Xin": cb.data.ExprDataSet.read_dataset("../../Datasets/data/Xin_2016/data.h5"),
+    "Lawlor": cb.data.ExprDataSet.read_dataset("../../Datasets/data/Lawlor/data.h5")
+}, meta_col="study", merge_uns_slots=["seurat_genes"]).normalize()
[ Warning ] 22569 out of 42694 variables are not found, will be set to zero!
+[ Warning ] 17241 out of 42694 variables are not found, will be set to zero!
+[ Warning ] 23648 out of 42694 variables are not found, will be set to zero!
+[ Warning ] 2843 out of 42694 variables are not found, will be set to zero!
+[ Warning ] 22809 out of 42694 variables are not found, will be set to zero!
+[  Info   ] Merging uns slots...
+[  Info   ] Merging var data frame...
+[  Info   ] Merging obs data frame...
+[  Info   ] Merging expression matrix...
In [10]:
%%capture
+model = cb.directi.fit_DIRECTi(
+    combined_dataset, combined_dataset.uns["seurat_genes"],
+    latent_dim=10, cat_dim=20, epoch=20,
+    path="/tmp/cb/examples/pancreas_unaligned_model"
+)
+combined_dataset.latent = model.inference(combined_dataset)
In [11]:
ax = combined_dataset.visualize_latent("study")
[  Info   ] Computing tSNE...
[Figure: tSNE plot of the merged dataset latent space before batch effect removal, colored by study]

You can remove the systematic bias from the latent space simply by specifying a batch_effect column, which is "study" in this case.


This time we train the model for 100 epochs to get better convergence.

In [12]:
%%capture
+model_rmbatch = cb.directi.fit_DIRECTi(
+    combined_dataset, combined_dataset.uns["seurat_genes"], batch_effect="study", 
+    latent_dim=10, cat_dim=20, epoch=100,
+    path="/tmp/cb/examples/pancreas_aligned_model"
+)
+combined_dataset.latent = model_rmbatch.inference(combined_dataset)

We see that the systematic bias is largely removed. Cells of the same cell type from different studies are well aligned.

In [13]:
ax = combined_dataset.visualize_latent("study")
[  Info   ] Computing tSNE...
[Figure: tSNE plot of the aligned latent space, colored by study]

In [14]:
ax = combined_dataset.visualize_latent("cell_ontology_class")
[  Info   ] Using cached tSNE...
[Figure: tSNE plot of the aligned latent space, colored by cell_ontology_class]
diff --git a/doc/conf.py b/doc/conf.py
index 193b54a..347b198 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -26,7 +26,7 @@
 # The short X.Y version
 version = ''
 # The full version, including alpha/beta/rc tags
-release = '0.1.1'
+release = '0.1.2'
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/doc/start.rst b/doc/start.rst
index 6f434ba..f97c89b 100644
--- a/doc/start.rst
+++ b/doc/start.rst
@@ -13,8 +13,8 @@ Vignettes
 We provide the following two ipython notebooks to help you start using
 ``Cell_BLAST`` and ``DIRECTi``.
 
-* `DIRECTi `_
-* `Cell BLAST `_
+* `DIRECTi <_static/DIRECTi.html>`_
+* `Cell BLAST <_static/BLAST.html>`_
 
 
 Selected Documentations
diff --git a/setup.py b/setup.py
index aae6aa1..c541bb5 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="Cell_BLAST",
-    version="0.1.1",
+    version="0.1.2",
     author="Zhijie Cao",
     author_email="caozj@mail.cbi.pku.edu.cn",
     description="Single-cell transcriptome querying tool",