This vignette introduces the PRECAST workflow for the analysis of four spatial transcriptomics datasets. The workflow consists of three steps

  • Independent preprocessing and model setting
  • Probabilistic embedding and clustering using PRECAST model
  • Downstream analysis (i.e. visualization of clusters and embeddings)

We demonstrate the use of PRECAST to four human dorsolateral prefrontal cortex Visium data. Download the data to R

suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(SingleCellExperiment))
name_ID4 <- as.character(c(151673, 151674, 151675, 151676))

### Read data in an online manner
n_ID <- length(name_ID4)
url_brainA <- "https://github.com/feiyoung/DR-SC.Analysis/raw/main/data/DLPFC_data/"
url_brainB <- ".rds"
seuList <- list()
if (!require(ProFAST)) {
    remotes::install_github("feiyoung/ProFAST")
}
for (i in 1:n_ID) {
    # i <- 1
    cat("input brain data", i, "\n")
    # load and read data
    dlpfc <- readRDS(url(paste0(url_brainA, name_ID4[i], url_brainB)))
    count <- dlpfc@assays@data$counts
    row.names(count) <- ProFAST::transferGeneNames(row.names(count), species = "Human")
    seu1 <- CreateSeuratObject(counts = count, meta.data = as.data.frame(colData(dlpfc)), min.cells = 10,
        min.features = 10)
    seuList[[i]] <- seu1
}
# saveRDS(seuList, file='seuList4.RDS')

The package can be loaded with the command:

Compare PRECAST with DR-SC in analyzing one sample

First, we view the the spatial transcriptomics data with Visium platform.

seuList <- readRDS("seuList4.RDS")
seuList  ## a list including  Seurat objects
#> [[1]]
#> An object of class Seurat 
#> 16578 features across 3639 samples within 1 assay 
#> Active assay: RNA (16578 features, 0 variable features)
#> 
#> [[2]]
#> An object of class Seurat 
#> 17332 features across 3673 samples within 1 assay 
#> Active assay: RNA (17332 features, 0 variable features)
#> 
#> [[3]]
#> An object of class Seurat 
#> 16062 features across 3592 samples within 1 assay 
#> Active assay: RNA (16062 features, 0 variable features)
#> 
#> [[4]]
#> An object of class Seurat 
#> 16097 features across 3460 samples within 1 assay 
#> Active assay: RNA (16097 features, 0 variable features)

check the meta data that must include the spatial coordinates named “row” and “col”, respectively. If the names are not, they are required to rename them.

metadataList <- lapply(seuList, function(x) x@meta.data)

for (r in seq_along(metadataList)) {
    meta_data <- metadataList[[r]]
    cat(all(c("row", "col") %in% colnames(meta_data)), "\n")  ## the names are correct!

}
#> TRUE 
#> TRUE 
#> TRUE 
#> TRUE

Prepare the PRECASTObject.

Create a PRECASTObj object to prepare for PRECAST models. Here, we only show the HVGs method to select the 2000 highly variable genes, but users are able to choose more genes or also use the SPARK-X to choose the spatially variable genes.

set.seed(2023)
preobj <- CreatePRECASTObject(seuList = seuList, selectGenesMethod = "HVGs", gene.number = 2000)  # 

Add the model setting

## check the number of genes/features after filtering step
preobj@seulist
#> [[1]]
#> An object of class Seurat 
#> 2000 features across 3639 samples within 1 assay 
#> Active assay: RNA (2000 features, 1304 variable features)
#> 
#> [[2]]
#> An object of class Seurat 
#> 2000 features across 3673 samples within 1 assay 
#> Active assay: RNA (2000 features, 1296 variable features)
#> 
#> [[3]]
#> An object of class Seurat 
#> 2000 features across 3592 samples within 1 assay 
#> Active assay: RNA (2000 features, 1363 variable features)
#> 
#> [[4]]
#> An object of class Seurat 
#> 2000 features across 3460 samples within 1 assay 
#> Active assay: RNA (2000 features, 1311 variable features)
## Add adjacency matrix list for a PRECASTObj object to prepare for PRECAST model fitting.
PRECASTObj <- AddAdjList(preobj, platform = "Visium")
## Add a model setting in advance for a PRECASTObj object. verbose =TRUE helps outputing the
## information in the algorithm.
PRECASTObj <- AddParSetting(PRECASTObj, Sigma_equal = TRUE, coreNum = 1, maxIter = 30, verbose = TRUE)

Fit PRECAST

For function PRECAST, users can specify the number of clusters \(K\) or set K to be an integer vector by using modified BIC(MBIC) to determine \(K\). Here, we use user-specified number of clusters.

### Given K
PRECASTObj <- PRECAST(PRECASTObj, K = 7)
#> fitting ...
#> 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |======================================================================| 100%
#> variable initialize finish! 
#> predict Y and V! 
#> diff Energy = 5.352642 
#> diff Energy = 3.869659 
#> Finish ICM step! 
#> iter = 2, loglik= -1751519.375000, dloglik=0.999184 
#> predict Y and V! 
#> diff Energy = 0.199198 
#> diff Energy = 18.637697 
#> diff Energy = 12.282670 
#> Finish ICM step! 
#> iter = 3, loglik= -1687532.375000, dloglik=0.036532 
#> predict Y and V! 
#> diff Energy = 34.223406 
#> diff Energy = 23.602364 
#> diff Energy = 7.225336 
#> diff Energy = 17.366884 
#> Finish ICM step! 
#> iter = 4, loglik= -1662926.500000, dloglik=0.014581 
#> predict Y and V! 
#> diff Energy = 9.217014 
#> diff Energy = 36.505916 
#> diff Energy = 12.640654 
#> diff Energy = 7.013317 
#> Finish ICM step! 
#> iter = 5, loglik= -1651196.250000, dloglik=0.007054 
#> predict Y and V! 
#> diff Energy = 11.597027 
#> diff Energy = 27.970960 
#> diff Energy = 27.865365 
#> diff Energy = 16.542714 
#> Finish ICM step! 
#> iter = 6, loglik= -1644952.125000, dloglik=0.003782 
#> predict Y and V! 
#> diff Energy = 14.099258 
#> diff Energy = 6.265841 
#> diff Energy = 16.304311 
#> diff Energy = 12.900834 
#> Finish ICM step! 
#> iter = 7, loglik= -1641321.000000, dloglik=0.002207 
#> predict Y and V! 
#> diff Energy = 14.007410 
#> diff Energy = 17.600648 
#> diff Energy = 25.950155 
#> diff Energy = 19.812621 
#> Finish ICM step! 
#> iter = 8, loglik= -1639139.875000, dloglik=0.001329 
#> predict Y and V! 
#> diff Energy = 10.483860 
#> diff Energy = 18.725734 
#> diff Energy = 9.710191 
#> diff Energy = 13.309923 
#> Finish ICM step! 
#> iter = 9, loglik= -1637770.375000, dloglik=0.000835 
#> predict Y and V! 
#> diff Energy = 11.523672 
#> diff Energy = 15.130851 
#> diff Energy = 6.538038 
#> diff Energy = 0.074181 
#> Finish ICM step! 
#> iter = 10, loglik= -1636871.000000, dloglik=0.000549 
#> predict Y and V! 
#> diff Energy = 8.364143 
#> diff Energy = 21.631868 
#> diff Energy = 6.242281 
#> diff Energy = 6.729873 
#> Finish ICM step! 
#> iter = 11, loglik= -1636283.250000, dloglik=0.000359 
#> predict Y and V! 
#> diff Energy = 0.927025 
#> diff Energy = 15.350188 
#> diff Energy = 11.847601 
#> diff Energy = 3.148637 
#> Finish ICM step! 
#> iter = 12, loglik= -1635864.250000, dloglik=0.000256 
#> predict Y and V! 
#> diff Energy = 1.586123 
#> diff Energy = 15.749150 
#> diff Energy = 0.509640 
#> diff Energy = 3.230909 
#> Finish ICM step! 
#> iter = 13, loglik= -1635580.375000, dloglik=0.000174 
#> predict Y and V! 
#> diff Energy = 3.843540 
#> diff Energy = 16.159437 
#> diff Energy = 1.552089 
#> diff Energy = 9.823270 
#> Finish ICM step! 
#> iter = 14, loglik= -1635368.500000, dloglik=0.000130 
#> predict Y and V! 
#> diff Energy = 4.408342 
#> diff Energy = 5.844825 
#> diff Energy = 0.815869 
#> diff Energy = 6.127618 
#> Finish ICM step! 
#> iter = 15, loglik= -1635197.375000, dloglik=0.000105 
#> predict Y and V! 
#> diff Energy = 0.780338 
#> diff Energy = 8.164158 
#> diff Energy = 6.340349 
#> diff Energy = 3.846594 
#> Finish ICM step! 
#> iter = 16, loglik= -1635056.875000, dloglik=0.000086 
#> predict Y and V! 
#> diff Energy = 2.431712 
#> diff Energy = 7.563790 
#> diff Energy = 2.636661 
#> diff Energy = 4.795888 
#> Finish ICM step! 
#> iter = 17, loglik= -1634961.250000, dloglik=0.000058 
#> predict Y and V! 
#> diff Energy = 2.976474 
#> diff Energy = 6.393884 
#> diff Energy = 7.891301 
#> diff Energy = 3.367600 
#> Finish ICM step! 
#> iter = 18, loglik= -1634876.875000, dloglik=0.000052 
#> predict Y and V! 
#> diff Energy = 2.636386 
#> diff Energy = 5.855440 
#> diff Energy = 0.534479 
#> diff Energy = 2.063849 
#> Finish ICM step! 
#> iter = 19, loglik= -1634818.750000, dloglik=0.000036 
#> predict Y and V! 
#> diff Energy = 3.368599 
#> diff Energy = 5.959762 
#> diff Energy = 10.145140 
#> diff Energy = 4.577272 
#> Finish ICM step! 
#> iter = 20, loglik= -1634758.625000, dloglik=0.000037 
#> predict Y and V! 
#> diff Energy = 3.075098 
#> diff Energy = 6.025854 
#> diff Energy = 2.577874 
#> diff Energy = 4.456743 
#> Finish ICM step! 
#> iter = 21, loglik= -1634704.750000, dloglik=0.000033 
#> predict Y and V! 
#> diff Energy = 0.851936 
#> diff Energy = 3.680762 
#> diff Energy = 5.000007 
#> diff Energy = 4.430635 
#> Finish ICM step! 
#> iter = 22, loglik= -1634682.625000, dloglik=0.000014 
#> predict Y and V! 
#> diff Energy = 3.930192 
#> diff Energy = 4.606272 
#> diff Energy = 3.037536 
#> diff Energy = 1.915371 
#> Finish ICM step! 
#> iter = 23, loglik= -1634640.000000, dloglik=0.000026 
#> predict Y and V! 
#> diff Energy = 1.289436 
#> diff Energy = 4.483134 
#> diff Energy = 4.826944 
#> diff Energy = 4.437447 
#> Finish ICM step! 
#> iter = 24, loglik= -1634615.125000, dloglik=0.000015 
#> predict Y and V! 
#> diff Energy = 1.368741 
#> diff Energy = 1.629112 
#> diff Energy = 3.685341 
#> diff Energy = 4.444629 
#> Finish ICM step! 
#> iter = 25, loglik= -1634594.125000, dloglik=0.000013 
#> predict Y and V! 
#> diff Energy = 1.994114 
#> diff Energy = 2.870984 
#> diff Energy = 1.030904 
#> diff Energy = 4.451361 
#> Finish ICM step! 
#> iter = 26, loglik= -1634549.125000, dloglik=0.000028 
#> predict Y and V! 
#> diff Energy = 0.478093 
#> diff Energy = 3.148266 
#> diff Energy = 4.119249 
#> diff Energy = 7.650194 
#> Finish ICM step!

Use the function SelectModel() to re-organize the fitted results in PRECASTObj.

## backup the fitting results in resList
resList <- PRECASTObj@resList
PRECASTObj <- SelectModel(PRECASTObj)
ari_precast <- sapply(1:length(seuList), function(r) mclust::adjustedRandIndex(PRECASTObj@resList$cluster[[r]],
    PRECASTObj@seulist[[r]]$layer_guess_reordered))
mat <- matrix(round(ari_precast, 2), nrow = 1)
name_ID4 <- as.character(c(151673, 151674, 151675, 151676))
colnames(mat) <- name_ID4
DT::datatable(mat)

Other options Users are also able to set multiple K, then choose the best one

PRECASTObj2 <- AddParSetting(PRECASTObj, Sigma_equal = FALSE, coreNum = 4, maxIter = 30, verbose = TRUE)  # set 4 cores to run in parallel.
PRECASTObj2 <- PRECAST(PRECASTObj2, K = 5:8)
## backup the fitting results in resList
resList2 <- PRECASTObj2@resList
PRECASTObj2 <- SelectModel(PRECASTObj2)

Besides, user can also use different initialization method by setting int.model, for example, set int.model=NULL; see the functions AddParSetting() and model_set() for more details.

Integration and Visualization

Integrate the all samples using the IntegrateSpaData function. For computational efficiency, this function exclusively integrates the variable genes. Specifically, in cases where users do not specify the PRECASTObj@seuList or seuList argument within the IntegrateSpaData function, it automatically focuses on integrating only the variable genes. The default setting for PRECASTObj@seuList is NULL when rawData.preserve in CreatePRECASTObject is set to FALSE. For instance:

print(PRECASTObj@seuList)
#> NULL
seuInt <- IntegrateSpaData(PRECASTObj, species = "Human")
seuInt
#> An object of class Seurat 
#> 2000 features across 14364 samples within 1 assay 
#> Active assay: PRE_CAST (2000 features, 0 variable features)
#>  2 dimensional reductions calculated: PRECAST, position
## The low-dimensional embeddings obtained by PRECAST are saved in PRECAST reduction slot.

Integrating all genes There are two ways to use IntegrateSpaData integrating all genes, which will require more memory. We recommand running for all genes on server. The first one is to set value for PRECASTObj@seuList.

## assign the raw Seurat list object to it.  For illustration, we generate a new seuList with
## more genes; For integrating all genes, users can set `seuList <- bc2`.
genes <- c(row.names(PRECASTObj@seulist[[1]]), row.names(seuList[[1]])[1:10])
seuList_sub <- lapply(seuList, function(x) x[genes, ])
PRECASTObj@seuList <- seuList_sub  # 
seuInt <- IntegrateSpaData(PRECASTObj, species = "Human")
seuInt

The second method is to set a value for the argument seuList:

PRECASTObj@seuList <- NULL
## At the same time, we can set subsampling to speed up the computation.
seuInt <- IntegrateSpaData(PRECASTObj, species = "Human", seuList = seuList_sub, subsample_rate = 0.5)
seuInt

First, user can choose a beautiful color schema using chooseColors().

cols_cluster <- chooseColors(palettes_name = "Classic 20", n_colors = 7, plot_colors = TRUE)

Show the spatial scatter plot for clusters


p12 <- SpaPlot(seuInt, item = "cluster", batch = NULL, point_size = 1, cols = cols_cluster, combine = TRUE,
    nrow.legend = 7)
p12


# users can plot each sample by setting combine=FALSE

Users can re-plot the above figures for specific need by returning a ggplot list object. For example, we plot the spatial heatmap using a common legend.

library(ggplot2)
pList <- SpaPlot(seuInt, item = "cluster", batch = NULL, point_size = 2.5, cols = cols_cluster,
    combine = FALSE, nrow.legend = 7)
pList <- lapply(pList, function(x) x + coord_flip() + scale_x_reverse())
drawFigs(pList, layout.dim = c(2, 2), common.legend = TRUE, legend.position = "right", align = "hv")

Show the spatial UMAP/tNSE RGB plot to illustrate the performance in extracting features.

seuInt <- AddUMAP(seuInt)
p13List <- SpaPlot(seuInt, batch = NULL, item = "RGB_UMAP", point_size = 2, combine = FALSE, text_size = 15)
p13List <- lapply(p13List, function(x) x + coord_flip() + scale_x_reverse())
drawFigs(p13List, layout.dim = c(2, 2), common.legend = TRUE, legend.position = "right", align = "hv")

# seuInt <- AddTSNE(seuInt) SpaPlot(seuInt, batch=NULL,item='RGB_TSNE',point_size=2,
# combine=T, text_size=15)

Show the tSNE plot based on the extracted features from PRECAST to check the performance of integration.

seuInt <- AddTSNE(seuInt, n_comp = 2)
p1 <- dimPlot(seuInt, item = "cluster", point_size = 0.5, font_family = "serif", cols = cols_cluster,
    border_col = "gray10", nrow.legend = 14, legend_pos = "right")  # Times New Roman
p2 <- dimPlot(seuInt, item = "batch", point_size = 0.5, font_family = "serif", legend_pos = "right")

drawFigs(list(p1, p2), layout.dim = c(1, 2), legend.position = "right", align = "hv")

Combined differential expression analysis

library(Seurat)
dat_deg <- FindAllMarkers(seuInt)
library(dplyr)
n <- 5
dat_deg %>%
    group_by(cluster) %>%
    top_n(n = n, wt = avg_log2FC) -> top10

Plot DE genes’ heatmap for each spatial domain identified by PRECAST.

library(ggplot2)
## HeatMap
p1 <- DotPlot(seuInt, features = unique(top10$gene), col.min = 0, col.max = 1) + theme(axis.text.x = element_text(angle = 45,
    hjust = 1, size = 8))
p1

Session Info

sessionInfo()
#> R version 4.1.2 (2021-11-01)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 22621)
#> 
#> Matrix products: default
#> 
#> locale:
#> [1] LC_COLLATE=Chinese (Simplified)_China.936 
#> [2] LC_CTYPE=Chinese (Simplified)_China.936   
#> [3] LC_MONETARY=Chinese (Simplified)_China.936
#> [4] LC_NUMERIC=C                              
#> [5] LC_TIME=Chinese (Simplified)_China.936    
#> 
#> attached base packages:
#> [1] parallel  stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#> [1] dplyr_1.0.9        sp_1.5-0           SeuratObject_4.1.0 Seurat_4.1.1      
#> [5] ggplot2_3.4.1      PRECAST_1.6.3      gtools_3.9.2.2    
#> 
#> loaded via a namespace (and not attached):
#>   [1] utf8_1.2.3                  reticulate_1.25            
#>   [3] tidyselect_1.1.2            htmlwidgets_1.5.4          
#>   [5] grid_4.1.2                  BiocParallel_1.28.3        
#>   [7] Rtsne_0.16                  munsell_0.5.0              
#>   [9] ScaledMatrix_1.2.0          codetools_0.2-18           
#>  [11] ragg_1.2.2                  ica_1.0-2                  
#>  [13] DT_0.21                     future_1.26.1              
#>  [15] miniUI_0.1.1.1              withr_2.5.0                
#>  [17] spatstat.random_2.2-0       colorspace_2.1-0           
#>  [19] progressr_0.10.1            Biobase_2.54.0             
#>  [21] highr_0.9                   knitr_1.37                 
#>  [23] rstudioapi_0.13             stats4_4.1.2               
#>  [25] SingleCellExperiment_1.16.0 ROCR_1.0-11                
#>  [27] ggsignif_0.6.3              tensor_1.5                 
#>  [29] listenv_0.8.0               labeling_0.4.2             
#>  [31] MatrixGenerics_1.6.0        GenomeInfoDbData_1.2.7     
#>  [33] polyclip_1.10-0             farver_2.1.1               
#>  [35] rprojroot_2.0.3             parallelly_1.32.0          
#>  [37] vctrs_0.6.1                 generics_0.1.2             
#>  [39] xfun_0.29                   ggthemes_4.2.4             
#>  [41] R6_2.5.1                    GenomeInfoDb_1.30.1        
#>  [43] ggbeeswarm_0.6.0            rsvd_1.0.5                 
#>  [45] bitops_1.0-7                spatstat.utils_3.0-1       
#>  [47] cachem_1.0.6                DelayedArray_0.20.0        
#>  [49] assertthat_0.2.1            promises_1.2.0.1           
#>  [51] scales_1.2.1                rgeos_0.5-9                
#>  [53] beeswarm_0.4.0              gtable_0.3.3               
#>  [55] beachmat_2.10.0             globals_0.15.0             
#>  [57] goftest_1.2-3               rlang_1.1.0                
#>  [59] systemfonts_1.0.4           splines_4.1.2              
#>  [61] rstatix_0.7.0               lazyeval_0.2.2             
#>  [63] broom_0.7.12                spatstat.geom_2.4-0        
#>  [65] yaml_2.3.6                  reshape2_1.4.4             
#>  [67] abind_1.4-5                 crosstalk_1.2.0            
#>  [69] backports_1.4.1             httpuv_1.6.5               
#>  [71] tools_4.1.2                 ellipsis_0.3.2             
#>  [73] spatstat.core_2.4-4         jquerylib_0.1.4            
#>  [75] RColorBrewer_1.1-3          BiocGenerics_0.40.0        
#>  [77] ggridges_0.5.3              Rcpp_1.0.10                
#>  [79] plyr_1.8.7                  sparseMatrixStats_1.6.0    
#>  [81] zlibbioc_1.40.0             purrr_0.3.4                
#>  [83] RCurl_1.98-1.6              ggpubr_0.4.0               
#>  [85] rpart_4.1.16                deldir_1.0-6               
#>  [87] viridis_0.6.2               pbapply_1.5-0              
#>  [89] cowplot_1.1.1               S4Vectors_0.32.3           
#>  [91] zoo_1.8-10                  SummarizedExperiment_1.24.0
#>  [93] ggrepel_0.9.1               cluster_2.1.2              
#>  [95] fs_1.5.2                    magrittr_2.0.3             
#>  [97] GiRaF_1.0.1                 RSpectra_0.16-1            
#>  [99] data.table_1.14.2           scattermore_0.8            
#> [101] lmtest_0.9-40               RANN_2.6.1                 
#> [103] fitdistrplus_1.1-8          matrixStats_0.62.0         
#> [105] patchwork_1.1.1             mime_0.12                  
#> [107] evaluate_0.15               xtable_1.8-4               
#> [109] mclust_5.4.10               IRanges_2.28.0             
#> [111] gridExtra_2.3               compiler_4.1.2             
#> [113] scater_1.25.1               tibble_3.2.1               
#> [115] KernSmooth_2.23-20          crayon_1.5.1               
#> [117] htmltools_0.5.2             mgcv_1.8-39                
#> [119] later_1.3.0                 tidyr_1.2.0                
#> [121] DBI_1.1.2                   formatR_1.11               
#> [123] MASS_7.3-55                 car_3.0-12                 
#> [125] Matrix_1.4-0                cli_3.2.0                  
#> [127] igraph_1.3.5                DR.SC_3.4                  
#> [129] GenomicRanges_1.46.1        pkgconfig_2.0.3            
#> [131] pkgdown_2.0.6               plotly_4.10.0              
#> [133] scuttle_1.4.0               spatstat.sparse_2.1-1      
#> [135] vipor_0.4.5                 bslib_0.3.1                
#> [137] XVector_0.34.0              CompQuadForm_1.4.3         
#> [139] stringr_1.4.0               digest_0.6.29              
#> [141] sctransform_0.3.3           RcppAnnoy_0.0.19           
#> [143] spatstat.data_3.0-0         rmarkdown_2.11             
#> [145] leiden_0.4.2                uwot_0.1.11                
#> [147] DelayedMatrixStats_1.16.0   shiny_1.7.1                
#> [149] lifecycle_1.0.3             nlme_3.1-155               
#> [151] jsonlite_1.8.0              carData_3.0-5              
#> [153] BiocNeighbors_1.12.0        limma_3.50.1               
#> [155] desc_1.4.0                  viridisLite_0.4.1          
#> [157] fansi_1.0.4                 pillar_1.9.0               
#> [159] lattice_0.20-45             fastmap_1.1.0              
#> [161] httr_1.4.3                  survival_3.2-13            
#> [163] glue_1.6.2                  png_0.1-7                  
#> [165] stringi_1.7.6               sass_0.4.1                 
#> [167] textshaping_0.3.6           BiocSingular_1.10.0        
#> [169] memoise_2.0.1               irlba_2.3.5                
#> [171] future.apply_1.9.0