Perform dreamlet analysis of single cell data assaying T cell populations in donors infected with tuberculosis.

Loading libraries

library(SingleCellExperiment)
library(zellkonverter)
library(ggplot2)
library(scattermore)
library(ggtree)
library(crumblr)
library(aplot)
library(tidyverse)
library(dreamlet)
library(kableExtra)
library(ggcorrplot)
library(RColorBrewer)
library(DelayedArray)

Load data

# read H5AD file
path = "/sc/arion/projects/CommonMind/hoffman/scRNAseq_data/Nathan_NatImm_2021/"
file = paste0(path, "/Nathan_NatImm_2021.h5ad")
sce = readH5AD(file, use_hdf5=TRUE)   
counts(sce) = assay(sce, "X")

# get only single-cell RNA-seq
# rest is CITE-seq proteins
only_rna = grep("prot", rownames(sce), invert=TRUE)

# create variable that distinguishes cells from the same donor
# but different batches
sce$donor_batch = with(colData(sce), paste(donor, batch))
sce$batch = factor(sce$batch)
sce$season = factor(sce$season)
sce$TB_status = factor(sce$TB_status, c("CONTROL", "CASE"))

reducedDim(sce, "UMAP") = with(colData(sce), data.frame(UMAP_1, UMAP_2))

# sort cell clusters by name
sce = sce[only_rna,sce$cluster_name != "NA"]
colData(sce) = droplevels(colData(sce))
tab = colData(sce) %>%
  as_tibble %>%
  group_by(cluster_ids, cluster_name) %>%
  filter(row_number()==1) %>%
  summarise(cluster_ids, cluster_name) %>%
  arrange(as.numeric(gsub("C-", '', cluster_ids)))

sce$cluster_name = factor(sce$cluster_name, unique(tab$cluster_name))
sce$cluster_ids = factor(sce$cluster_ids, unique(tab$cluster_ids))

Plot UMAP

colData(sce) %>%  
  data.frame %>%
  ggplot(aes(UMAP_1, UMAP_2, color=cluster_name)) +
    geom_scattermore() +
    theme_classic() +
    theme(aspect.ratio=1)

Create pseudobulk

pb <- aggregateToPseudoBulk(sce,
    assay = "counts",     
    cluster_id = "cluster_name",  
    sample_id = "donor_batch")

Plots of cell fractions

fracs = cellCounts(pb) / rowSums(cellCounts(pb))

df = data.frame(fracs, TB_status = pb$TB_status, check.names=FALSE)
df = reshape2::melt(df, id.vars="TB_status")

ggplot(df, aes(TB_status, value, fill=variable)) + 
  geom_violin() +
  geom_boxplot(fill="grey50", width=.1) + 
  facet_wrap(~ variable) +
  theme_classic() +
  theme(aspect.ratio=1, legend.position="none") +
  ylab("Fraction")

Evaluate crumblr and build tree

# run crumblr on cell counts
cobj = crumblr(cellCounts(pb))

# build tree from pseudobulk expression
hcl = buildClusterTreeFromPB( pb )

Variance partitioning analysis

Scale age to avoid numerical issues.

Removing repeated measures

Keep only a single sample per donor

form = ~ as.numeric(TB_status) + (1|batch) + (1|season) + (1|sex) + scale(age) + I(scale(age)^2) + scale(prop_NAT)

keep = !duplicated(colData(pb)$donor)
vp = fitExtractVarPartModel(cobj[,keep], form, colData(pb)[keep,])

# Rename columns and sum 2 age components
vp$age = vp[,'scale(age)'] + vp[,'I(scale(age)^2)']
vp[,'scale(age)']  = c()
vp[,'I(scale(age)^2)'] = c()
vp[,'TB_status'] = vp[,'as.numeric(TB_status)']
vp[,'as.numeric(TB_status)'] = c()
vp[,'Ancestry'] = vp[,'scale(prop_NAT)']
vp[,'scale(prop_NAT)'] = c()

cols = c(brewer.pal(ncol(vp)-1, "Set1"), "grey85")

plotVarPart(sortCols(vp), col=cols)

fig.vp = plotPercentBars(sortCols(vp), col=cols) 
fig.vp

Differential testing

# Run precision-weighted linear mixed model using dream()
# and compute empirical Bayes moderated t-statistics
form = ~ TB_status + (1|batch) + (1|season) + (1|sex) + scale(age) + I(scale(age)^2) + scale(prop_NAT)
fit = dream(cobj[,keep], form, colData(pb)[keep,])
fit = eBayes(fit) 

# show results
topTable(fit, coef='TB_statusCASE', number=Inf) %>%   
  select(logFC, AveExpr, t, P.Value, adj.P.Val) %>% 
  kbl() %>%  
  kable_classic(full_width = FALSE)
logFC AveExpr t P.Value adj.P.Val
CD4+ Th17 -0.2257359 0.4235041 -4.7370817 0.0000037 0.0001084
CD4+ Th17/1 -0.1437905 1.3118369 -3.7573419 0.0002210 0.0032051
CD4+ CD161+ Th1 -0.1729482 0.4253747 -3.1423799 0.0018696 0.0180723
CD4+ Th1 -0.1127846 1.2720219 -2.7894947 0.0056725 0.0376635
CD4+ CD27+CD161+ -0.1352943 1.3537376 -2.7455767 0.0064937 0.0376635
CD4/8+ PD-1+TIGIT+ 0.1530373 -0.3545267 2.5151422 0.0125014 0.0604234
CD4+ cytotoxic 0.2476681 0.3599038 2.4280652 0.0158939 0.0658462
CD8+ GZMK+ 0.1511185 0.1385331 2.0603913 0.0403559 0.1462902
CD4+ central -0.0785180 0.9132650 -1.9409260 0.0534599 0.1722596
CD4+ RORC+ Treg -0.0923229 -1.1832085 -1.7996958 0.0730948 0.1972883
CD8+ GZMB+ 0.1926524 0.0288309 1.7891270 0.0748335 0.1972883
CD4+ CD161+ cytotoxic 0.1763841 -0.6023561 1.6587939 0.0985623 0.2381922
CD4+ CD27+ -0.0622216 1.3197772 -1.3817057 0.1682514 0.3590821
CD4+ CCR5+ cytotoxic 0.1008632 -1.1284844 1.3514941 0.1777094 0.3590821
Vd2 0.1187868 -0.5075829 1.3190773 0.1884217 0.3590821
CD4+ Treg -0.0742707 0.7746323 -1.2908919 0.1981143 0.3590821
CD4+ lncRNA -0.0575043 0.6657067 -1.2135632 0.2261575 0.3857982
CD8+ central 0.0791749 -0.6547533 1.0919502 0.2758852 0.4444818
CD8+ CXCR3+ 0.0603235 -0.9614326 0.9505680 0.3427493 0.5231437
CD4+ CCR4+ICOS+ central -0.0435270 0.9873410 -0.8930360 0.3727905 0.5405462
CD4+ CCR4+ -0.0369053 0.5048199 -0.7776485 0.4374817 0.6041414
CD4+ activated 0.0440065 -0.4235253 0.7040916 0.4820850 0.6267070
CD8+ activated 0.0542842 -1.6675538 0.6801943 0.4970435 0.6267070
CD4+ CD38+ICOS+ central -0.0274512 -0.8408021 -0.5619769 0.5746432 0.6811781
Vd1 0.0484634 -3.4950052 0.5436273 0.5872225 0.6811781
CD4+ HLA-DR+ 0.0217049 -0.5522151 0.3499343 0.7269274 0.8108037
CD4+ CCR4+ central -0.0158614 0.9945189 -0.2849111 0.7759731 0.8334526
CD4+ CD161+ Th2 -0.0005680 0.2044739 -0.0070329 0.9943954 0.9956035
CD4+ Th2 -0.0003209 0.6931680 -0.0055163 0.9956035 0.9956035

Multivariate test along hierarchy

# Perform multivariate testing 
res1 = treeTest( fit, cobj, hcl, coef="TB_statusCASE")

fig1 = plotTreeTestBeta(res1)
fig2 = crumblr::plotForest(res1, hide=TRUE)
fig3 = plotPercentBars( sortCols(vp), col=cols)
  
fig2 %>% insert_left(fig1) %>% insert_right(fig3)

Plot of one cell type

# get CLR values from crumblr, but don't regularize weights
cobj = crumblr(cellCounts(pb[,keep]), max.ratio=Inf)

library(ggbeeswarm)
library(ggrepel)

CT = "CD4+ Th17"
df = data.frame(CLR = cobj$E[CT,],
                TB_status = pb$TB_status[keep],
                se = 1/sqrt(cobj$weights[CT,]),
                counts = cellCounts(pb[,keep])[,CT],
                totalCells = rowSums(cellCounts(pb[,keep])),
                colData(pb[,keep]))

# points to highlight
df_count1 = df %>%
  filter(counts > 0) %>%
  arrange(-se) %>% 
  head(2) %>%
  select(CLR, TB_status, se, counts, totalCells)

df_count2 = df %>%
  arrange(se) %>% 
  head(2) %>%
  select(CLR, TB_status, se, counts, totalCells)

df_count = rbind(df_count1, df_count2)


n.skip = sum(scale(df$se) > 3)
zmax = sort(df$se, decreasing=TRUE)[n.skip+1]

df %>%
  arrange(se) %>%
  ggplot(aes(TB_status, CLR, color=pmin(zmax, se))) +
    geom_boxplot(width=.3) +
    geom_beeswarm(cex = 2, size=3) + 
    theme_classic() +
    theme(plot.title = element_text(hjust = 0.5), aspect.ratio=1, axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
    ggtitle(CT) +
    scale_color_gradient2(low="grey", high="red", limits= c(0,zmax*1.01), name = "Standard error") +
    geom_text_repel(data=df_count, aes(TB_status, CLR, label=paste(counts, totalCells, sep=' / ')), color="black", box.padding=2)

Session info

sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Rocky Linux 9.4 (Blue Onyx)
## 
## Matrix products: default
## BLAS:   /hpc/packages/minerva-rocky9/R/4.4.1/lib64/R/lib/libRblas.so 
## LAPACK: /hpc/packages/minerva-rocky9/R/4.4.1/lib64/R/lib/libRlapack.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/New_York
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices datasets  utils     methods  
## [8] base     
## 
## other attached packages:
##  [1] ggrepel_0.9.6               ggbeeswarm_0.7.2           
##  [3] DelayedArray_0.32.0         SparseArray_1.6.0          
##  [5] S4Arrays_1.6.0              abind_1.4-8                
##  [7] Matrix_1.7-1                RColorBrewer_1.1-3         
##  [9] ggcorrplot_0.1.4.1          kableExtra_1.4.0           
## [11] dreamlet_1.4.1              variancePartition_1.36.3   
## [13] BiocParallel_1.40.0         limma_3.62.1               
## [15] lubridate_1.9.3             forcats_1.0.0              
## [17] stringr_1.5.1               dplyr_1.1.4                
## [19] purrr_1.0.2                 readr_2.1.5                
## [21] tidyr_1.3.1                 tibble_3.2.1               
## [23] tidyverse_2.0.0             aplot_0.2.4                
## [25] crumblr_0.99.11             ggtree_3.10.1              
## [27] scattermore_1.2             ggplot2_3.5.1              
## [29] zellkonverter_1.16.0        SingleCellExperiment_1.28.1
## [31] SummarizedExperiment_1.36.0 Biobase_2.66.0             
## [33] GenomicRanges_1.58.0        GenomeInfoDb_1.42.1        
## [35] IRanges_2.40.0              S4Vectors_0.44.0           
## [37] BiocGenerics_0.52.0         MatrixGenerics_1.18.0      
## [39] matrixStats_1.4.1          
## 
## loaded via a namespace (and not attached):
##   [1] fs_1.6.5                  bitops_1.0-9             
##   [3] httr_1.4.7                Rgraphviz_2.50.0         
##   [5] numDeriv_2016.8-1.1       tools_4.4.1              
##   [7] backports_1.5.0           R6_2.5.1                 
##   [9] metafor_4.6-0             HDF5Array_1.34.0         
##  [11] lazyeval_0.2.2            rhdf5filters_1.18.0      
##  [13] withr_3.0.2               prettyunits_1.2.0        
##  [15] gridExtra_2.3             cli_3.6.3                
##  [17] labeling_0.4.3            sass_0.4.9               
##  [19] KEGGgraph_1.66.0          SQUAREM_2021.1           
##  [21] mvtnorm_1.3-2             proxy_0.4-27             
##  [23] mixsqp_0.3-54             systemfonts_1.1.0        
##  [25] yulab.utils_0.1.8         svglite_2.1.3            
##  [27] zenith_1.8.0              invgamma_1.1             
##  [29] rstudioapi_0.17.1         RSQLite_2.3.8            
##  [31] generics_0.1.3            gridGraphics_0.5-1       
##  [33] gtools_3.9.5              metadat_1.2-0            
##  [35] lifecycle_1.0.4           yaml_2.3.10              
##  [37] edgeR_4.4.1               mathjaxr_1.6-0           
##  [39] pamr_1.57                 rhdf5_2.50.0             
##  [41] gplots_3.2.0              grid_4.4.1               
##  [43] blob_1.2.4                crayon_1.5.3             
##  [45] dir.expiry_1.14.0         lattice_0.22-6           
##  [47] msigdbr_7.5.1             annotate_1.84.0          
##  [49] KEGGREST_1.46.0           pillar_1.10.0            
##  [51] knitr_1.49                tcltk_4.4.1              
##  [53] boot_1.3-31               corpcor_1.6.10           
##  [55] codetools_0.2-20          glue_1.8.0               
##  [57] ggfun_0.1.8               data.table_1.16.4        
##  [59] vctrs_0.6.5               png_0.1-8                
##  [61] treeio_1.26.0             Rdpack_2.6.2             
##  [63] gtable_0.3.6              assertthat_0.2.1         
##  [65] cachem_1.1.0              xfun_0.49                
##  [67] rbibutils_2.3             Rfast_2.1.0              
##  [69] survival_3.7-0            iterators_1.0.14         
##  [71] statmod_1.5.0             nlme_3.1-166             
##  [73] tsne_0.1-3.1              pbkrtest_0.5.3           
##  [75] bit64_4.5.2               progress_1.2.3           
##  [77] EnvStats_3.0.0            filelock_1.0.3           
##  [79] bslib_0.8.0               irlba_2.3.5.1            
##  [81] vipor_0.4.7               KernSmooth_2.23-24       
##  [83] colorspace_2.1-1          rmeta_3.0                
##  [85] DBI_1.2.3                 tidyselect_1.2.1         
##  [87] bit_4.5.0                 compiler_4.4.1           
##  [89] graph_1.84.0              basilisk.utils_1.18.0    
##  [91] xml2_1.3.6                tcltk2_1.2-11            
##  [93] scales_1.3.0              caTools_1.18.3           
##  [95] remaCor_0.0.18            digest_0.6.37            
##  [97] minqa_1.2.8               rmarkdown_2.29           
##  [99] basilisk_1.18.0           aod_1.3.3                
## [101] XVector_0.46.0            RhpcBLASctl_0.23-42      
## [103] htmltools_0.5.8.1         pkgconfig_2.0.3          
## [105] lme4_1.1-35.5             sparseMatrixStats_1.18.0 
## [107] mashr_0.2.79              fastmap_1.2.0            
## [109] rlang_1.1.4               UCSC.utils_1.2.0         
## [111] DelayedMatrixStats_1.28.0 farver_2.1.2             
## [113] jquerylib_0.1.4           jsonlite_1.8.9           
## [115] RCurl_1.98-1.16           magrittr_2.0.3           
## [117] GenomeInfoDbData_1.2.13   ggplotify_0.1.2          
## [119] patchwork_1.3.0           Rhdf5lib_1.28.0          
## [121] munsell_0.5.1             Rcpp_1.0.13-1            
## [123] ape_5.8-1                 babelgene_22.9           
## [125] viridis_0.6.5             reticulate_1.40.0        
## [127] EnrichmentBrowser_2.36.0  RcppZiggurat_0.1.6       
## [129] stringi_1.8.4             zlibbioc_1.52.0          
## [131] MASS_7.3-61               plyr_1.8.9               
## [133] parallel_4.4.1            Biostrings_2.74.0        
## [135] splines_4.4.1             hms_1.1.3                
## [137] locfit_1.5-9.10           reshape2_1.4.4           
## [139] XML_3.99-0.17             evaluate_1.0.1           
## [141] RcppParallel_5.1.9        nloptr_2.1.1             
## [143] tzdb_0.4.0                ashr_2.2-63              
## [145] broom_1.0.7               xtable_1.8-4             
## [147] fANCOVA_0.6-1             e1071_1.7-16             
## [149] tidytree_0.4.6            viridisLite_0.4.2        
## [151] class_7.3-22              truncnorm_1.0-9          
## [153] stylo_0.7.5               lmerTest_3.1-3           
## [155] memoise_2.0.1             beeswarm_0.4.0           
## [157] AnnotationDbi_1.68.0      cluster_2.1.6            
## [159] timechange_0.3.0          GSEABase_1.68.0