diff --git a/Cell_BLAST/__init__.py b/Cell_BLAST/__init__.py index 56771b6..146a46e 100644 --- a/Cell_BLAST/__init__.py +++ b/Cell_BLAST/__init__.py @@ -30,4 +30,4 @@ "config" ] -__version__ = "0.1.1" +__version__ = "0.1.2" diff --git a/Cell_BLAST/data.py b/Cell_BLAST/data.py index a9bc679..d232d29 100644 --- a/Cell_BLAST/data.py +++ b/Cell_BLAST/data.py @@ -294,6 +294,16 @@ def __getitem__(self, slices): uns=copy.deepcopy(dict(self.uns)) ) + def clean_duplicate_vars(self): + unique_vars, duplicate_mask = \ + set(), np.ones(self.var_names.size).astype(np.bool_) + for idx, item in enumerate(self.var_names): + if item in unique_vars: + duplicate_mask[idx] = False + else: + unique_vars.add(item) + return self[:, duplicate_mask] + def get_meta_or_var(self, names, normalize_var=False, log_var=False): """ Get either meta information (column names in ``obs``) or @@ -796,7 +806,7 @@ def obs_correlation_heatmap( def violin( self, group, var, normalize_var=True, width=7, height=7, - ax=None, **kwargs + ax=None, strip_kws=None, violin_kws=None ): """ Violin plot across obs groups. @@ -816,7 +826,10 @@ def violin( ax : matplotlib.axes.Axes Specify an existing axes to plot onto, by default None. If specified, ``width`` and ``height`` take no effect. - **kwargs + strip_kws : dict + Additional keyword arguments will be passed to + ``seaborn.stripplot``. + violin_kws : dict Additional keyword arguments will be passed to ``seaborn.violinplot``. 
@@ -828,15 +841,22 @@ def violin( import matplotlib.pyplot as plt import seaborn as sns + strip_kws = {} if strip_kws is None else strip_kws + violin_kws = {} if violin_kws is None else violin_kws + df = self.get_meta_or_var( [group, var], normalize_var=normalize_var, log_var=True ) if ax is None: _, ax = plt.subplots(figsize=(width, height)) + ax = sns.stripplot( + x=group, y=var, data=df, + color=".3", edgecolor=None, size=3, ax=ax, **strip_kws + ) ax = sns.violinplot( x=group, y=var, data=df, - scale="width", ax=ax, inner="point", **kwargs + scale="width", ax=ax, inner=None, **violin_kws ) ax.spines["right"].set_visible(False) ax.spines["top"].set_visible(False) diff --git a/Datasets/ACA_datasets.csv b/Datasets/ACA_datasets.csv index c1cd177..3643086 100644 --- a/Datasets/ACA_datasets.csv +++ b/Datasets/ACA_datasets.csv @@ -10,9 +10,9 @@ Dahlin_10x,Mus musculus,Bone Marrow,NA,10x,46447,A single-cell hematopoietic lan Dahlin_mutant,Mus musculus,Bone Marrow,NA,10x,14675,A single-cell hematopoietic landscape resolves 8 lineage trajectories and defects in Kit mutant mice,TRUE,c-Kit mutant,collect_dahlin.R Quake_10x_Bone_Marrow,Mus musculus,Bone Marrow,,10x,3652,Single-cell transcriptomics of 20 mouse organs creates a Tabula Muris,TRUE,,collect_quake_10x.R Quake_Smart-seq2_Bone_Marrow,Mus musculus,Bone Marrow,,Smart-seq2,5037,Single-cell transcriptomics of 20 mouse organs creates a Tabula Muris,TRUE,,collect_quake_smartseq2.R -Tusi,Mus musculus,Bone Marrow,adult,inDrop,4763,Population snapshots predict early haematopoietic and erythroid hierarchies,FALSE,Continuous,collect_tusi.R -Velten_QUARTZ-seq,Homo sapiens,Bone Marrow,29-year-old,QUARTZ-seq,379,Human haematopoietic stem cell lineage commitment is a continuous process,FALSE,"no meta, continuous",collect_velten.R -Velten_Smart-seq2,Homo sapiens,Bone Marrow,25-year-old,Smart-seq2,1035,Human haematopoietic stem cell lineage commitment is a continuous process,FALSE,"no meta, continuous",collect_velten.R +Tusi,Mus 
musculus,Bone Marrow,adult,inDrop,4763,Population snapshots predict early haematopoietic and erythroid hierarchies,TRUE,Continuous,collect_tusi.R +Velten_QUARTZ-seq,Homo sapiens,Bone Marrow,29-year-old,QUARTZ-seq,379,Human haematopoietic stem cell lineage commitment is a continuous process,TRUE,"no meta, continuous",collect_velten.R +Velten_Smart-seq2,Homo sapiens,Bone Marrow,25-year-old,Smart-seq2,1035,Human haematopoietic stem cell lineage commitment is a continuous process,TRUE,"no meta, continuous",collect_velten.R Campbell,Mus musculus,Brain,,Drop-seq,20921,A molecular census of arcuate hypothalamus and median eminence cell types,TRUE,Adult Arc-ME complex,collect_campbell.R Chen,Mus musculus,Brain,,Drop-seq,12089,Single-Cell RNA-Seq Reveals Hypothalamic Cell Diversity,TRUE,Adult hypothalamus,collect_chen.R Lake_2018,Homo sapiens,Brain,,snDrop-seq,35289,Integrative single-cell analysis of transcriptional and epigenetic states in the human adult brain,TRUE,,collect_lake_2018.R @@ -85,4 +85,4 @@ Montoro_10x,Mus musculus,Trachea,adult,10x,7193,A revised airway epithelial hier Montoro_Smart-seq2,Mus musculus,Trachea,adult,modified Smart-seq2,301,A revised airway epithelial hierarchy includes CFTR-expressing ionocytes,TRUE,3 WT,collect_montoro_smartseq2.R Plasschaert,Mus musculus,Trachea,adult,inDrop,6977,A single-cell atlas of the airway epithelium reveals the CFTR-rich pulmonary ionocyte,TRUE,4 WT,collect_plasschaert.R Quake_10x_Trachea,Mus musculus,Trachea,,10x,11269,Single-cell transcriptomics of 20 mouse organs creates a Tabula Muris,TRUE,,collect_quake_10x.R -Quake_Smart-seq2_Trachea,Mus musculus,Trachea,,Smart-seq2,1350,Single-cell transcriptomics of 20 mouse organs creates a Tabula Muris,TRUE,,collect_quake_smartseq2.R +Quake_Smart-seq2_Trachea,Mus musculus,Trachea,,Smart-seq2,1350,Single-cell transcriptomics of 20 mouse organs creates a Tabula Muris,TRUE,,collect_quake_smartseq2.R \ No newline at end of file diff --git a/Datasets/aligned_datasets.csv 
b/Datasets/aligned_datasets.csv index d56de6e..a449ae0 100644 --- a/Datasets/aligned_datasets.csv +++ b/Datasets/aligned_datasets.csv @@ -15,7 +15,8 @@ ALIGNED_Mus_musculus_Pancreas,Mus musculus,Pancreas,,,,,,"Baron_mouse, Quake_Sma ALIGNED_Mus_musculus_Retina,Mus musculus,Retina,,,,,,"Macosko, Shekhar" ALIGNED_Mus_musculus_Small_Intestine,Mus musculus,Small Intestine,,,,,,"Haber_10x, Haber_10x_largecell, Haber_10x_region, Haber_10x_FAE, Haber_Smart-seq2" ALIGNED_Mus_musculus_Spleen,Mus musculus,Spleen,,,,,,"Quake_10x_Spleen, Quake_Smart-seq2_Spleen" +ALIGNED_Mus_musculus_Heart_and_Aorta,Mus musculus,"Heart, Aorta",,,,,,"Quake_10x_Heart_and_Aorta, Quake_Smart-seq2_Heart" ALIGNED_Mus_musculus_Thymus,Mus musculus,Thymus,,,,,,"Quake_10x_Thymus, Quake_Smart-seq2_Thymus" ALIGNED_Mus_musculus_Tongue,Mus musculus,Tongue,,,,,,"Quake_10x_Tongue, Quake_Smart-seq2_Tongue" ALIGNED_Mus_musculus_Trachea,Mus musculus,Trachea,,,,,,"Quake_10x_Trachea, Quake_Smart-seq2_Trachea" -ALIGNED_Tabula_Muris,Mus musculus,Trachea,,,,,,"Quake_10x, Quake_Smart-seq2" +ALIGNED_Tabula_Muris,Mus musculus,Atlas,,,,,,"Quake_10x, Quake_Smart-seq2" diff --git a/Datasets/collect/collect_ariss.R b/Datasets/collect/collect_ariss.R new file mode 100644 index 0000000..1f64ff0 --- /dev/null +++ b/Datasets/collect/collect_ariss.R @@ -0,0 +1,70 @@ +#! 
/usr/bin/env Rscript +# by wangshuai +# 11 Mar 2019 +# 14:36:35 PM + +suppressPackageStartupMessages({ + library(Seurat) +}) +source("../../Utilities/data.R", chdir = TRUE) + +#READ label file +cat("Reading label file...\n") +metadata1 <- read.table("../download/Ariss/wt_Rbf_and_populations.txt",header=T,stringsAsFactors = F) +row.names(metadata1) <- metadata1[,'CellName'] +metadata1 <- metadata1[,c('Genotype','cell_type1')] + +metadata2 <- read.table("../download/Ariss/Cells_and_population.txt",header=T,row.names=1,stringsAsFactors = F) +metadata2$Genotype <- 'wt' + +includedcells<-union(row.names(metadata1),row.names(metadata2)) +metadata <- rbind(metadata2,metadata1) +metadata <- metadata[which(row.names(metadata) %in% includedcells),] + +celltypes <- read.csv('../download/celltypes',sep='\t') + +metadata$lifestage <- 'third instar larva stage' +metadata$organ <- 'eye disc' +metadata$race <- 'Drosophila melanogaster' + +#READ DGE +cat("Reading DGE\n") +path <- "../download/Ariss/GSE115476" +fileNames <- dir(path) +filePath <- sapply(fileNames, function(x){ + paste(path,x,sep='/')}) +data <- lapply(filePath, function(x){ + read.table(x, header=T,stringsAsFactors = F)}) + +i <- 1 +for (name in names(data)){ + perfix<-substr(name,gregexpr(pattern = '_',text = name)[[1]]+1,gregexpr(pattern = "\\.",text = name)[[1]]-1) + colnames(data[[i]]) <- lapply(colnames(data[[i]]),function(x){ + paste(perfix,x,sep='_')}) + genes <- data[[i]][,1] + included_cells <- intersect(rownames(metadata), colnames(data[[i]])) + data[[i]] <- data.frame(genes,data[[i]][, included_cells]) + i <- i+1 +} + +expmerge <- Reduce(function(x,y) merge(x,y,by=1,all=T),data) +row.names(expmerge)<-expmerge[,1] +expmerge<-expmerge[,-1] +included_cells <- intersect(rownames(metadata), colnames(expmerge)) +metadata <- metadata[included_cells, ] +expmerge <- expmerge[, included_cells] +expmerge[is.na(expmerge)]<-0 + +expressed_genes <- rownames(expmerge)[rowSums(expmerge > 1) > 5] +expmerge <- 
Matrix(as.matrix(expmerge),sparse = T) + +message("Constructing dataset...") +dataset <- new("ExprDataSet", + exprs = expmerge, obs = metadata, + var = data.frame(row.names = rownames(expmerge)), + uns = list(expressed_genes = expressed_genes) +) + +message("Saving data...") +write_dataset(dataset, "../data/Ariss/data.h5") +cat("Done!\n") diff --git a/Datasets/collect/collect_tusi.R b/Datasets/collect/collect_tusi.R index 25dc398..1e5fcb6 100755 --- a/Datasets/collect/collect_tusi.R +++ b/Datasets/collect/collect_tusi.R @@ -27,10 +27,15 @@ colnames(potential) <- "potential" meta_df <- Reduce(cbind, list(meta_df, fate, potential)) rownames(meta_df) <- meta_df$cell_id meta_df$cell_id <- NULL +meta_df$cell_type1 = "HSPC" expr_mat <- expr_mat[rownames(meta_df), ] +#assign cell ontology +cell_ontology <- read.csv("../cell_ontology/bone_marrow_cell_ontology.csv") +cell_ontology <- cell_ontology[, c("cell_type1", "cell_ontology_class", "cell_ontology_id")] + #datasets_meta datasets_meta <- read.csv("../ACA_datasets.csv", header = TRUE, row.names = 1) -construct_dataset("../data/Tusi", t(expr_mat), meta_df, datasets_meta, grouping = "batch") +construct_dataset("../data/Tusi", t(expr_mat), meta_df, datasets_meta, cell_ontology, grouping = "batch") message("Done!") diff --git a/doc/_static/BLAST.html b/doc/_static/BLAST.html new file mode 100644 index 0000000..d1f4a9a --- /dev/null +++ b/doc/_static/BLAST.html @@ -0,0 +1,12222 @@ + + +
+import time
+import warnings
+warnings.filterwarnings("ignore")
+
+import tensorflow as tf
+tf.logging.set_verbosity(0)
+
+import Cell_BLAST as cb
+cb.config.N_JOBS = 4
+cb.config.RANDOM_SEED = 0
+
In this tutorial, we demonstrate how to perform cell BLAST based on DIRECTi models.
+Again, we use the pancreas datasets as an example.
+ +baron_human = cb.data.ExprDataSet.read_dataset("../../Datasets/data/Baron_human/data.h5").normalize()
+
Cell BLAST uses multiple models to increase specificity.
+Here we first train 4 DIRECTi models, each with a different random seed. Maximal epoch number is set to 50 to save time.
+ +%%capture
+models = []
+for i in range(4):
+ models.append(cb.directi.fit_DIRECTi(
+ baron_human, baron_human.uns["seurat_genes"], latent_dim=10, cat_dim=20,
+ epoch=50, patience=10, random_seed=i,
+ path="/tmp/cb/examples/baron_human_blast_models/model_%d" % i
+ ))
+
Then we build a cell BLAST "database" by feeding our previously trained models and the reference dataset. The chained method build_empirical
enables empirical p-values in downstream analyses.
blast = cb.blast.BLAST(models, baron_human).build_empirical()
+
Like DIRECTi models, BLAST
objects can be easily saved and loaded.
blast.save("./baron_human_blast")
+del blast
+blast = cb.blast.BLAST.load("./baron_human_blast")
+
We load another pancreas dataset to demonstrate the querying process.
+ +lawlor = cb.data.ExprDataSet.read_dataset("../../Datasets/data/Lawlor/data.h5").normalize()
+
To query the database, we first use the query
method to obtain initial hits in the reference database. This is done by efficient Euclidean distance based nearest neighbor search in the latent space. Nearest neighbors in the latent space of each model will be merged. Though highly efficient, latent space Euclidean distance is not the best metric to determine cell-cell similarity. To obtain better accuracy and specificity, we also compute posterior distribution distances as well as empirical p-values for these nearest neighbors.
Then we use reconcile_models
to pool together information from multiple models and filter
the initial hits to obtain significant hits.
start_time = time.time()
+lawlor_hits = blast.query(lawlor).reconcile_models().filter(by="pval", cutoff=0.05)
+print("Querying time: %.3f ms/cell" % (
+ (time.time() - start_time) * 1000 / len(lawlor_hits)
+))
+
Detailed information can be checked by printing the hits object.
+ +print(lawlor_hits[0:1])
+
Finally, we can use annotate
method to obtain cell type predictions.
lawlor_predictions = lawlor_hits.annotate("cell_ontology_class")
+
Cell type prediction can be compared with author provided annotation via Sankey diagrams. We see that for the Lawlor
query, cell type predictions are very accurate.
fig = cb.blast.sankey(
+ lawlor.obs["cell_ontology_class"].values,
+ lawlor_predictions.values.ravel(),
+ title="Lawlor to Baron_human", tint_cutoff=3
+)
+
import time
+import warnings
+warnings.filterwarnings("ignore")
+
+import tensorflow as tf
+tf.logging.set_verbosity(0)
+
+import Cell_BLAST as cb
+cb.config.N_JOBS = 4
+cb.config.RANDOM_SEED = 0
+
Let's first load a dataset (Baron, M. et al. Cell Syst, 2016), which profiles >8,000 human pancreatic islet cells.
+Here we normalized the dataset so that the resulting model, once fitted, can be used to project other datasets normalized in the same way.
++ +Theoretically speaking, read count distribution may deviate from negative binomial due to the scaling, but practically it still fits very well.
+
baron_human = cb.data.ExprDataSet.read_dataset("../../Datasets/data/Baron_human/data.h5").normalize()
+
Now we build and fit a DIRECTi model with the one-step fit_DIRECTi
function.
We set latent space dimensionality to 10, and go with 20 intrinsic clusters.
+To save time, we only train for 50 epochs. The model has not fully converged but already works well.
+ +%%capture
+model = cb.directi.fit_DIRECTi(
+ baron_human, baron_human.uns["seurat_genes"],
+ latent_dim=10, cat_dim=20, epoch=50,
+ path="./baron_human_model"
+)
+
We can project cells into the low dimensional latent space using the inference
method.
+It is recommended that you store the returned latent space into the latent
slot of the original dataset object, which facilitates visualization.
baron_human.latent = model.inference(baron_human)
+
ax = baron_human.visualize_latent("cell_ontology_class")
+
We see that different cell types can readily be distinguished.
+Note that though 20-dimensional categorical latent was used, far fewer clusters form in the latent space. This is because the model is flexible to discard categories or to use multiple categories to represent the same cluster if a redundant number of categories is specified.
+You can also save the model for future use. It is straightforward to load a saved model.
+ +model.save()
+model.close()
+del model
+model = cb.directi.DIRECTi.load("./baron_human_model")
+
We can also project other datasets with the same model. Here we test with the Muraro dataset (Muraro, M. et al. Cell Systems, 2016)
+There will be a warning saying that we have some genes missing in the new dataset, but it doesn't really matter. Distinct cell types are still well separated.
+ +muraro = cb.data.ExprDataSet.read_dataset("../../Datasets/data/Muraro/data.h5").normalize()
+muraro.latent = model.inference(muraro)
+ax = muraro.visualize_latent("cell_ontology_class")
+
If we train on a "meta" dataset merged from multiple datasets, we'll find significant systematic bias among them.
+ +combined_dataset = cb.data.ExprDataSet.merge_datasets({
+ "Baron_human": cb.data.ExprDataSet.read_dataset("../../Datasets/data/Baron_human/data.h5"),
+ "Segerstolpe": cb.data.ExprDataSet.read_dataset("../../Datasets/data/Segerstolpe/data.h5"),
+ "Muraro": cb.data.ExprDataSet.read_dataset("../../Datasets/data/Muraro/data.h5"),
+ "Xin": cb.data.ExprDataSet.read_dataset("../../Datasets/data/Xin_2016/data.h5"),
+ "Lawlor": cb.data.ExprDataSet.read_dataset("../../Datasets/data/Lawlor/data.h5")
+}, meta_col="study", merge_uns_slots=["seurat_genes"]).normalize()
+
%%capture
+model = cb.directi.fit_DIRECTi(
+ combined_dataset, combined_dataset.uns["seurat_genes"],
+ latent_dim=10, cat_dim=20, epoch=20,
+ path="/tmp/cb/examples/pancreas_unaligned_model"
+)
+combined_dataset.latent = model.inference(combined_dataset)
+
ax = combined_dataset.visualize_latent("study")
+
You can remove the systematical bias from the latent space simply by specifying a batch_effect
column, which is "study" in this case.
This time we train the model for 100 epochs to get better convergence.
+ +%%capture
+model_rmbatch = cb.directi.fit_DIRECTi(
+ combined_dataset, combined_dataset.uns["seurat_genes"], batch_effect="study",
+ latent_dim=10, cat_dim=20, epoch=100,
+ path="/tmp/cb/examples/pancreas_aligned_model"
+)
+combined_dataset.latent = model_rmbatch.inference(combined_dataset)
+
We see that systematic bias is largely removed. Cells of the same cell type from different studies are well aligned.
+ +ax = combined_dataset.visualize_latent("study")
+
ax = combined_dataset.visualize_latent("cell_ontology_class")
+