Introduction

This vignette showcases the main functionalities of the sketchR package and illustrates how it can be used to generate a subsample of a dataset using the geometric sketching/subsampling algorithms and implementations proposed by Hie et al. (2019) and Song et al. (2022), as well as to create a set of diagnostic plots.

Preparation

We start by loading the required packages and preparing an example dataset.

suppressPackageStartupMessages({
    library(sketchR)
    library(TENxPBMCData)
    library(scuttle)
    library(scran)
    library(scater)
    library(SingleR)
    library(celldex)
    library(cowplot)
    library(SummarizedExperiment)
    library(SingleCellExperiment)
    library(beachmat.hdf5)
})

We will use the PBMC3k dataset from the TENxPBMCData Bioconductor package for illustration. The chunk below prepares the dataset by calculating log-transformed normalized counts, finding highly variable genes, performing dimensionality reduction, and predicting cell type labels using the SingleR package.

## Load data
## (the PBMC3k dataset shipped with the TENxPBMCData package)
pbmc3k <- TENxPBMCData::TENxPBMCData(dataset = "pbmc3k")
#> see ?TENxPBMCData and browseVignettes('TENxPBMCData') for documentation
#> loading from cache

## Set row and column names
## Columns are named Cell1, Cell2, ...; row names are built from the
## ENSEMBL_ID and Symbol_TENx columns of the row annotation (presumably
## gene symbols, disambiguated with the Ensembl IDs where needed)
colnames(pbmc3k) <- paste0("Cell", seq_len(ncol(pbmc3k)))
rownames(pbmc3k) <- scuttle::uniquifyFeatureNames(
    ID = SummarizedExperiment::rowData(pbmc3k)$ENSEMBL_ID,
    names = SummarizedExperiment::rowData(pbmc3k)$Symbol_TENx
)

## Normalize and log-transform counts
pbmc3k <- scuttle::logNormCounts(pbmc3k)

## Find highly variable genes
## (model the per-gene variance, then keep the top 2000 genes)
dec <- scran::modelGeneVar(pbmc3k)
top.hvgs <- scran::getTopHVGs(dec, n = 2000)

## Perform dimensionality reduction
## The seed is set before both steps (presumably so that the embeddings are
## reproducible). PCA is restricted to the highly variable genes, and the
## tSNE is computed from the stored "PCA" coordinates.
set.seed(100)
pbmc3k <- scater::runPCA(pbmc3k, subset_row = top.hvgs)
pbmc3k <- scater::runTSNE(pbmc3k, dimred = "PCA")

## Predict cell type labels
## The Monaco immune data from celldex serves as the reference; its main
## labels are used for the prediction, and the predicted label of each cell
## is stored in the 'labels_main' column of pbmc3k.
ref_monaco <- celldex::MonacoImmuneData()
#> see ?celldex and browseVignettes('celldex') for documentation
#> downloading 1 resources
#> retrieving 1 resource
#> loading from cache
#> see ?celldex and browseVignettes('celldex') for documentation
#> downloading 1 resources
#> retrieving 1 resource
#> loading from cache
pred_monaco_main <- SingleR::SingleR(test = pbmc3k, ref = ref_monaco, 
                                     labels = ref_monaco$label.main)
pbmc3k$labels_main <- pred_monaco_main$labels

## Dimensions of the prepared object: rows (features) x columns (cells)
dim(pbmc3k)
#> [1] 32738  2700

Subsampling

The geosketch() function performs geometric sketching by calling the geosketch Python package. The output is a vector of indices that can be used to subset the full dataset. The provided seed will be propagated to the Python code to achieve reproducibility.

## Geometric sketch of 800 cells, computed on the PCA coordinates
idx800gs <- geosketch(
    SingleCellExperiment::reducedDim(pbmc3k, "PCA"),
    N = 800,
    seed = 123
)
#> + /home/pkgbuild/.cache/R/basilisk/1.15.4/0/bin/conda 'create' '--yes' '--prefix' '/home/pkgbuild/.cache/R/basilisk/1.15.4/sketchR/0.99.0/universal' 'python=3.9.12' '--quiet' '-c' 'bioconda' '-c' 'conda-forge'
#> + /home/pkgbuild/.cache/R/basilisk/1.15.4/0/bin/conda 'install' '--yes' '--prefix' '/home/pkgbuild/.cache/R/basilisk/1.15.4/sketchR/0.99.0/universal' 'python=3.9.12'
#> + /home/pkgbuild/.cache/R/basilisk/1.15.4/0/bin/conda 'install' '--yes' '--prefix' '/home/pkgbuild/.cache/R/basilisk/1.15.4/sketchR/0.99.0/universal' '-c' 'bioconda' '-c' 'conda-forge' 'python=3.9.12' '_libgcc_mutex=0.1' '_openmp_mutex=4.5' 'anndata=0.10.5.post1' 'array-api-compat=1.4.1' 'blosc=1.21.5' 'bottleneck=1.3.7' 'brotli=1.1.0' 'brotli-bin=1.1.0' 'bzip2=1.0.8' 'c-ares=1.26.0' 'ca-certificates=2023.11.17' 'cached-property=1.5.2' 'cached_property=1.5.2' 'certifi=2023.11.17' 'colorama=0.4.6' 'contourpy=1.2.0' 'curl=8.5.0' 'cycler=0.12.1' 'dunamai=1.19.0' 'exceptiongroup=1.2.0' 'fonttools=4.47.2' 'freetype=2.12.1' 'get_version=3.5.5' 'gettext=0.21.1' 'git=2.43.0' 'h5py=3.8.0' 'hdf5=1.14.0' 'icu=73.2' 'importlib-metadata=7.0.1' 'importlib_metadata=7.0.1' 'joblib=1.1.0' 'jpeg=9e' 'keyutils=1.6.1' 'kiwisolver=1.4.5' 'krb5=1.21.2' 'lcms2=2.12' 'ld_impl_linux-64=2.40' 'legacy-api-wrap=1.2' 'lerc=3.0' 'libaec=1.1.2' 'libblas=3.9.0' 'libbrotlicommon=1.1.0' 'libbrotlidec=1.1.0' 'libbrotlienc=1.1.0' 'libcblas=3.9.0' 'libcurl=8.5.0' 'libdeflate=1.10' 'libedit=3.1.20191231' 'libev=4.33' 'libexpat=2.5.0' 'libffi=3.4.2' 'libgcc-ng=13.2.0' 'libgfortran-ng=13.2.0' 'libgfortran5=13.2.0' 'libgomp=13.2.0' 'libhwloc=2.9.3' 'libiconv=1.17' 'liblapack=3.9.0' 'libllvm14=14.0.6' 'libnghttp2=1.58.0' 'libnsl=2.0.1' 'libopenblas=0.3.25' 'libpng=1.6.42' 'libsqlite=3.44.2' 'libssh2=1.11.0' 'libstdcxx-ng=13.2.0' 'libtiff=4.3.0' 'libuuid=2.38.1' 'libwebp-base=1.3.2' 'libxcrypt=4.4.36' 'libxml2=2.12.4' 'libzlib=1.2.13' 'llvmlite=0.40.1' 'lz4-c=1.9.4' 'lzo=2.10' 'matplotlib-base=3.6.3' 'munkres=1.0.7' 'natsort=8.4.0' 'ncurses=6.4' 'networkx=3.2' 'nomkl=1.0' 'numba=0.57.1' 'numexpr=2.8.3' 'numpy=1.21.5' 'olefile=0.47' 'openjpeg=2.5.0' 'openssl=3.2.1' 'packaging=23.2' 'pandas=1.3.5' 'patsy=0.5.6' 'pcre2=10.42' 'perl=5.32.1' 'pillow=8.4.0' 'pip=23.3.2' 'pynndescent=0.5.11' 'pyparsing=3.1.1' 'pytables=3.7.0' 'python=3.9.12' 'python-dateutil=2.8.2' 'python_abi=3.9' 'pytz=2023.4' 'readline=8.2' 
#> 'scanpy=1.7.2' 'scikit-learn=1.0.2' 'scipy=1.7.3' 'seaborn=0.13.2' 'seaborn-base=0.13.2' 'setuptools=69.0.3' 'setuptools-scm=8.0.4' 'setuptools_scm=8.0.4' 'sinfo=0.3.1' 'six=1.16.0' 'snappy=1.1.10' 'sqlite=3.44.2' 'statsmodels=0.13.5' 'stdlib-list=0.10.0' 'tbb=2021.11.0' 'threadpoolctl=3.0.0' 'tk=8.6.13' 'tomli=2.0.1' 'tqdm=4.66.1' 'typing-extensions=4.9.0' 'typing_extensions=4.9.0' 'tzdata=2023d' 'umap-learn=0.5.5' 'unicodedata2=15.1.0' 'wheel=0.42.0' 'xz=5.2.6' 'zipp=3.17.0' 'zlib=1.2.13' 'zstd=1.5.5'
## First few selected indices (these index into the full dataset)
head(idx800gs)
#> [1]  5  6  7 10 11 15
## Number of selected cells; equals the requested N = 800
length(idx800gs)
#> [1] 800

Similarly, the scsampler() function calls the scSampler Python package to perform subsampling.

## Subsample 800 cells with scSampler, computed on the PCA coordinates
idx800scs <- scsampler(
    SingleCellExperiment::reducedDim(pbmc3k, "PCA"),
    N = 800,
    seed = 123
)
## Peek at the first few selected indices and check the subsample size
head(idx800scs)
#> [1] 1079  644 1494 1278    5  391
length(idx800scs)
#> [1] 800

To illustrate the result of the subsampling, we plot the tSNE representation of the original data as well as the two subsets (using the tSNE coordinates derived from the full dataset).

## One tSNE panel per dataset, all coloured by the predicted cell type label
cowplot::plot_grid(
    plotlist = list(
        ## full dataset
        scater::plotTSNE(pbmc3k, colour_by = "labels_main"),
        ## geosketch subsample
        scater::plotTSNE(pbmc3k[, idx800gs], colour_by = "labels_main"),
        ## scSampler subsample
        scater::plotTSNE(pbmc3k[, idx800scs], colour_by = "labels_main")
    )
)

We can also illustrate the relative abundance of each cell type in the full data and in the subsets, respectively.

## Relative abundance of each 'labels_main' value in the full data and in
## each of the two subsamples
compareCompositionPlot(
    SummarizedExperiment::colData(pbmc3k),
    idx = list(geosketch = idx800gs, scSampler = idx800scs),
    column = "labels_main"
)

Diagnostic plots

sketchR provides a convenient function to plot the Hausdorff distance between the full dataset and the subsample, for a range of sketch sizes (to make this plot reproducible, we use set.seed before the call).

## Seed R's random number generator first so that the plot is reproducible
set.seed(123)
## Hausdorff distance for three sketch sizes, three replicates each,
## for each of the three subsampling methods
hausdorffDistPlot(
    mat = SingleCellExperiment::reducedDim(pbmc3k, "PCA"),
    Nvec = c(400, 800, 2000),
    Nrep = 3,
    methods = c("geosketch", "scsampler", "uniform")
)

Session info

## Print the R version, platform and package versions used for this vignette
sessionInfo()
#> R Under development (unstable) (2024-01-16 r85808)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 22.04.3 LTS
#> 
#> Matrix products: default
#> BLAS:   /home/biocbuild/bbs-3.19-bioc/R/lib/libRblas.so 
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_GB              LC_COLLATE=C              
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: America/New_York
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] beachmat.hdf5_1.1.0         cowplot_1.1.3              
#>  [3] celldex_1.13.0              SingleR_2.5.1              
#>  [5] scater_1.31.2               ggplot2_3.4.4              
#>  [7] scran_1.31.2                scuttle_1.13.0             
#>  [9] TENxPBMCData_1.21.0         HDF5Array_1.31.5           
#> [11] rhdf5_2.47.2                DelayedArray_0.29.1        
#> [13] SparseArray_1.3.3           S4Arrays_1.3.3             
#> [15] abind_1.4-5                 Matrix_1.6-5               
#> [17] SingleCellExperiment_1.25.0 SummarizedExperiment_1.33.3
#> [19] Biobase_2.63.0              GenomicRanges_1.55.2       
#> [21] GenomeInfoDb_1.39.5         IRanges_2.37.1             
#> [23] S4Vectors_0.41.3            BiocGenerics_0.49.1        
#> [25] MatrixGenerics_1.15.0       matrixStats_1.2.0          
#> [27] sketchR_0.99.0              BiocStyle_2.31.0           
#> 
#> loaded via a namespace (and not attached):
#>   [1] DBI_1.2.1                 bitops_1.0-7             
#>   [3] gridExtra_2.3             rlang_1.1.3              
#>   [5] magrittr_2.0.3            compiler_4.4.0           
#>   [7] RSQLite_2.3.5             dir.expiry_1.11.0        
#>   [9] DelayedMatrixStats_1.25.1 png_0.1-8                
#>  [11] vctrs_0.6.5               pkgconfig_2.0.3          
#>  [13] crayon_1.5.2              fastmap_1.1.1            
#>  [15] dbplyr_2.4.0              XVector_0.43.1           
#>  [17] labeling_0.4.3            utf8_1.2.4               
#>  [19] rmarkdown_2.25            ggbeeswarm_0.7.2         
#>  [21] purrr_1.0.2               bit_4.0.5                
#>  [23] xfun_0.41                 bluster_1.13.0           
#>  [25] zlibbioc_1.49.0           cachem_1.0.8             
#>  [27] beachmat_2.19.1           jsonlite_1.8.8           
#>  [29] blob_1.2.4                highr_0.10               
#>  [31] rhdf5filters_1.15.2       Rhdf5lib_1.25.1          
#>  [33] BiocParallel_1.37.0       irlba_2.3.5.1            
#>  [35] parallel_4.4.0            cluster_2.1.6            
#>  [37] R6_2.5.1                  bslib_0.6.1              
#>  [39] limma_3.59.2              reticulate_1.35.0        
#>  [41] jquerylib_0.1.4           Rcpp_1.0.12              
#>  [43] knitr_1.45                igraph_2.0.1.1           
#>  [45] tidyselect_1.2.0          viridis_0.6.5            
#>  [47] yaml_2.3.8                codetools_0.2-19         
#>  [49] curl_5.2.0                lattice_0.22-5           
#>  [51] tibble_3.2.1              withr_3.0.0              
#>  [53] basilisk.utils_1.15.1     KEGGREST_1.43.0          
#>  [55] Rtsne_0.17                evaluate_0.23            
#>  [57] BiocFileCache_2.11.1      ExperimentHub_2.11.1     
#>  [59] Biostrings_2.71.2         pillar_1.9.0             
#>  [61] BiocManager_1.30.22       filelock_1.0.3           
#>  [63] generics_0.1.3            RCurl_1.98-1.14          
#>  [65] BiocVersion_3.19.1        sparseMatrixStats_1.15.0 
#>  [67] munsell_0.5.0             scales_1.3.0             
#>  [69] glue_1.7.0                metapod_1.11.1           
#>  [71] tools_4.4.0               AnnotationHub_3.11.1     
#>  [73] BiocNeighbors_1.21.2      ScaledMatrix_1.11.0      
#>  [75] locfit_1.5-9.8            grid_4.4.0               
#>  [77] AnnotationDbi_1.65.2      edgeR_4.1.15             
#>  [79] colorspace_2.1-0          GenomeInfoDbData_1.2.11  
#>  [81] basilisk_1.15.4           beeswarm_0.4.0           
#>  [83] BiocSingular_1.19.0       vipor_0.4.7              
#>  [85] cli_3.6.2                 rsvd_1.0.5               
#>  [87] rappdirs_0.3.3            fansi_1.0.6              
#>  [89] viridisLite_0.4.2         dplyr_1.1.4              
#>  [91] gtable_0.3.4              sass_0.4.8               
#>  [93] digest_0.6.34             ggrepel_0.9.5            
#>  [95] dqrng_0.3.2               farver_2.1.1             
#>  [97] memoise_2.0.1             htmltools_0.5.7          
#>  [99] lifecycle_1.0.4           httr_1.4.7               
#> [101] mime_0.12                 statmod_1.5.0            
#> [103] bit64_4.0.5

References

Hie, Brian, Hyunghoon Cho, Benjamin DeMeo, Bryan Bryson, and Bonnie Berger. 2019. “Geometric Sketching Compactly Summarizes the Single-Cell Transcriptomic Landscape.” Cell Syst 8 (6): 483–493.e7.

Song, Dongyuan, Nan Miles Xi, Jingyi Jessica Li, and Lin Wang. 2022. “ScSampler: Fast Diversity-Preserving Subsampling of Large-Scale Single-Cell Transcriptomic Data.” bioRxiv, 2022.01.15.476407.