Perform dreamlet analysis of single cell data assaying T cell populations in donors infected with tuberculosis.

Loading libraries

library(SingleCellExperiment)
library(zellkonverter)
library(ggplot2)
library(scattermore)
library(ggtree)
library(crumblr)
library(aplot)
library(tidyverse)
library(dreamlet)
library(kableExtra)
library(ggcorrplot)
library(RColorBrewer)
library(DelayedArray)

Load data

# read H5AD file
path = "/sc/arion/projects/CommonMind/hoffman/scRNAseq_data/Nathan_NatImm_2021/"
file = paste0(path, "/Nathan_NatImm_2021.h5ad")
sce = readH5AD(file, use_hdf5=TRUE)   
counts(sce) = assay(sce, "X")

# get only single-cell RNA-seq
# rest is CITE-seq proteins
only_rna = grep("prot", rownames(sce), invert=TRUE)

# create variable that distinguishes cells from the same donor
# but different batches
sce$donor_batch = with(colData(sce), paste(donor, batch))
sce$batch = factor(sce$batch)
sce$season = factor(sce$season)
sce$TB_status = factor(sce$TB_status, c("CONTROL", "CASE"))

reducedDim(sce, "UMAP") = with(colData(sce), data.frame(UMAP_1, UMAP_2))

# sort cell clusters by name
sce = sce[only_rna,sce$cluster_name != "NA"]
colData(sce) = droplevels(colData(sce))
tab = colData(sce) %>%
  as_tibble %>%
  group_by(cluster_ids, cluster_name) %>%
  filter(row_number()==1) %>%
  summarise(cluster_ids, cluster_name) %>%
  arrange(as.numeric(gsub("C-", '', cluster_ids)))

sce$cluster_name = factor(sce$cluster_name, unique(tab$cluster_name))
sce$cluster_ids = factor(sce$cluster_ids, unique(tab$cluster_ids))

Plot UMAP

colData(sce) %>%  
  data.frame %>%
  ggplot(aes(UMAP_1, UMAP_2, color=cluster_name)) +
    geom_scattermore() +
    theme_classic() +
    theme(aspect.ratio=1)

Create pseudobulk

pb <- aggregateToPseudoBulk(sce,
    assay = "counts",     
    cluster_id = "cluster_name",  
    sample_id = "donor_batch")

Plots of cell fractions

fracs = cellCounts(pb) / rowSums(cellCounts(pb))

df = data.frame(fracs, TB_status = pb$TB_status, check.names=FALSE)
df = reshape2::melt(df, id.vars="TB_status")

ggplot(df, aes(TB_status, value, fill=variable)) + 
  geom_violin() +
  geom_boxplot(fill="grey50", width=.1) + 
  facet_wrap(~ variable) +
  theme_classic() +
  theme(aspect.ratio=1, legend.position="none") +
  ylab("Fraction")

Evaluate crumblr and build tree

# run crumblr on cell counts
cobj = crumblr(cellCounts(pb))

# build tree from pseudobulk expression
hcl = buildClusterTreeFromPB( pb )

Variance partitioning analysis

Scale age to avoid numerical issues.

Removing repeated measures

Keep only a single sample per donor

form = ~ as.numeric(TB_status) + (1|batch) + (1|season) + (1|sex) + scale(age) + I(scale(age)^2) + scale(prop_NAT)

keep = !duplicated(colData(pb)$donor)
vp = fitExtractVarPartModel(cobj[,keep], form, colData(pb)[keep,])

# Rename columns and sum 2 age components
vp$age = vp[,'scale(age)'] + vp[,'I(scale(age)^2)']
vp[,'scale(age)']  = c()
vp[,'I(scale(age)^2)'] = c()
vp[,'TB_status'] = vp[,'as.numeric(TB_status)']
vp[,'as.numeric(TB_status)'] = c()
vp[,'Ancestry'] = vp[,'scale(prop_NAT)']
vp[,'scale(prop_NAT)'] = c()

cols = c(brewer.pal(ncol(vp)-1, "Set1"), "grey85")

plotVarPart(sortCols(vp), col=cols)

fig.vp = plotPercentBars(sortCols(vp), col=cols) 
fig.vp

Differential testing

# Run precision-weighted linear mixed model using dream()
# and compute empirical Bayes moderated t-statistics
form = ~ TB_status + (1|batch) + (1|season) + (1|sex) + scale(age) + I(scale(age)^2) + scale(prop_NAT)
fit = dream(cobj[,keep], form, colData(pb)[keep,])
fit = eBayes(fit) 

# show results
topTable(fit, coef='TB_statusCASE', number=Inf) %>%   
  select(logFC, AveExpr, t, P.Value, adj.P.Val) %>% 
  kbl() %>%  
  kable_classic(full_width = FALSE)

	logFC	AveExpr	t	P.Value	adj.P.Val
CD4+ Th17	-0.2257359	0.4235041	-4.7370817	0.0000037	0.0001084
CD4+ Th17/1	-0.1437905	1.3118369	-3.7573419	0.0002210	0.0032051
CD4+ CD161+ Th1	-0.1729482	0.4253747	-3.1423799	0.0018696	0.0180723
CD4+ Th1	-0.1127846	1.2720219	-2.7894947	0.0056725	0.0376635
CD4+ CD27+CD161+	-0.1352943	1.3537376	-2.7455767	0.0064937	0.0376635
CD4/8+ PD-1+TIGIT+	0.1530373	-0.3545267	2.5151422	0.0125014	0.0604234
CD4+ cytotoxic	0.2476681	0.3599038	2.4280652	0.0158939	0.0658462
CD8+ GZMK+	0.1511185	0.1385331	2.0603913	0.0403559	0.1462902
CD4+ central	-0.0785180	0.9132650	-1.9409260	0.0534599	0.1722596
CD4+ RORC+ Treg	-0.0923229	-1.1832085	-1.7996958	0.0730948	0.1972883
CD8+ GZMB+	0.1926524	0.0288309	1.7891270	0.0748335	0.1972883
CD4+ CD161+ cytotoxic	0.1763841	-0.6023561	1.6587939	0.0985623	0.2381922
CD4+ CD27+	-0.0622216	1.3197772	-1.3817057	0.1682514	0.3590821
CD4+ CCR5+ cytotoxic	0.1008632	-1.1284844	1.3514941	0.1777094	0.3590821
Vd2	0.1187868	-0.5075829	1.3190773	0.1884217	0.3590821
CD4+ Treg	-0.0742707	0.7746323	-1.2908919	0.1981143	0.3590821
CD4+ lncRNA	-0.0575043	0.6657067	-1.2135632	0.2261575	0.3857982
CD8+ central	0.0791749	-0.6547533	1.0919502	0.2758852	0.4444818
CD8+ CXCR3+	0.0603235	-0.9614326	0.9505680	0.3427493	0.5231437
CD4+ CCR4+ICOS+ central	-0.0435270	0.9873410	-0.8930360	0.3727905	0.5405462
CD4+ CCR4+	-0.0369053	0.5048199	-0.7776485	0.4374817	0.6041414
CD4+ activated	0.0440065	-0.4235253	0.7040916	0.4820850	0.6267070
CD8+ activated	0.0542842	-1.6675538	0.6801943	0.4970435	0.6267070
CD4+ CD38+ICOS+ central	-0.0274512	-0.8408021	-0.5619769	0.5746432	0.6811781
Vd1	0.0484634	-3.4950052	0.5436273	0.5872225	0.6811781
CD4+ HLA-DR+	0.0217049	-0.5522151	0.3499343	0.7269274	0.8108037
CD4+ CCR4+ central	-0.0158614	0.9945189	-0.2849111	0.7759731	0.8334526
CD4+ CD161+ Th2	-0.0005680	0.2044739	-0.0070329	0.9943954	0.9956035
CD4+ Th2	-0.0003209	0.6931680	-0.0055163	0.9956035	0.9956035

Multivariate test along hierarchy

# Perform multivariate testing 
res1 = treeTest( fit, cobj, hcl, coef="TB_statusCASE")

fig1 = plotTreeTestBeta(res1)
fig2 = crumblr::plotForest(res1, hide=TRUE)
fig3 = plotPercentBars( sortCols(vp), col=cols)
  
fig2 %>% insert_left(fig1) %>% insert_right(fig3)

Plot of one cell type

# get CLR values from crumblr, but don't regularize weights
cobj = crumblr(cellCounts(pb[,keep]), max.ratio=Inf)

library(ggbeeswarm)
library(ggrepel)

CT = "CD4+ Th17"
df = data.frame(CLR = cobj$E[CT,],
                TB_status = pb$TB_status[keep],
                se = 1/sqrt(cobj$weights[CT,]),
                counts = cellCounts(pb[,keep])[,CT],
                totalCells = rowSums(cellCounts(pb[,keep])),
                colData(pb[,keep]))

# points to highlight
df_count1 = df %>%
  filter(counts > 0) %>%
  arrange(-se) %>% 
  head(2) %>%
  select(CLR, TB_status, se, counts, totalCells)

df_count2 = df %>%
  arrange(se) %>% 
  head(2) %>%
  select(CLR, TB_status, se, counts, totalCells)

df_count = rbind(df_count1, df_count2)


n.skip = sum(scale(df$se) > 3)
zmax = sort(df$se, decreasing=TRUE)[n.skip+1]

df %>%
  arrange(se) %>%
  ggplot(aes(TB_status, CLR, color=pmin(zmax, se))) +
    geom_boxplot(width=.3) +
    geom_beeswarm(cex = 2, size=3) + 
    theme_classic() +
    theme(plot.title = element_text(hjust = 0.5), aspect.ratio=1, axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
    ggtitle(CT) +
    scale_color_gradient2(low="grey", high="red", limits= c(0,zmax*1.01), name = "Standard error") +
    geom_text_repel(data=df_count, aes(TB_status, CLR, label=paste(counts, totalCells, sep=' / ')), color="black", box.padding=2)

Session info

sessionInfo()

## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Rocky Linux 9.4 (Blue Onyx)
## 
## Matrix products: default
## BLAS:   /hpc/packages/minerva-rocky9/R/4.4.1/lib64/R/lib/libRblas.so 
## LAPACK: /hpc/packages/minerva-rocky9/R/4.4.1/lib64/R/lib/libRlapack.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/New_York
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices datasets  utils     methods  
## [8] base     
## 
## other attached packages:
##  [1] ggrepel_0.9.6               ggbeeswarm_0.7.2           
##  [3] DelayedArray_0.32.0         SparseArray_1.6.0          
##  [5] S4Arrays_1.6.0              abind_1.4-8                
##  [7] Matrix_1.7-1                RColorBrewer_1.1-3         
##  [9] ggcorrplot_0.1.4.1          kableExtra_1.4.0           
## [11] dreamlet_1.4.1              variancePartition_1.36.3   
## [13] BiocParallel_1.40.0         limma_3.62.1               
## [15] lubridate_1.9.3             forcats_1.0.0              
## [17] stringr_1.5.1               dplyr_1.1.4                
## [19] purrr_1.0.2                 readr_2.1.5                
## [21] tidyr_1.3.1                 tibble_3.2.1               
## [23] tidyverse_2.0.0             aplot_0.2.4                
## [25] crumblr_0.99.11             ggtree_3.10.1              
## [27] scattermore_1.2             ggplot2_3.5.1              
## [29] zellkonverter_1.16.0        SingleCellExperiment_1.28.1
## [31] SummarizedExperiment_1.36.0 Biobase_2.66.0             
## [33] GenomicRanges_1.58.0        GenomeInfoDb_1.42.1        
## [35] IRanges_2.40.0              S4Vectors_0.44.0           
## [37] BiocGenerics_0.52.0         MatrixGenerics_1.18.0      
## [39] matrixStats_1.4.1          
## 
## loaded via a namespace (and not attached):
##   [1] fs_1.6.5                  bitops_1.0-9             
##   [3] httr_1.4.7                Rgraphviz_2.50.0         
##   [5] numDeriv_2016.8-1.1       tools_4.4.1              
##   [7] backports_1.5.0           R6_2.5.1                 
##   [9] metafor_4.6-0             HDF5Array_1.34.0         
##  [11] lazyeval_0.2.2            rhdf5filters_1.18.0      
##  [13] withr_3.0.2               prettyunits_1.2.0        
##  [15] gridExtra_2.3             cli_3.6.3                
##  [17] labeling_0.4.3            sass_0.4.9               
##  [19] KEGGgraph_1.66.0          SQUAREM_2021.1           
##  [21] mvtnorm_1.3-2             proxy_0.4-27             
##  [23] mixsqp_0.3-54             systemfonts_1.1.0        
##  [25] yulab.utils_0.1.8         svglite_2.1.3            
##  [27] zenith_1.8.0              invgamma_1.1             
##  [29] rstudioapi_0.17.1         RSQLite_2.3.8            
##  [31] generics_0.1.3            gridGraphics_0.5-1       
##  [33] gtools_3.9.5              metadat_1.2-0            
##  [35] lifecycle_1.0.4           yaml_2.3.10              
##  [37] edgeR_4.4.1               mathjaxr_1.6-0           
##  [39] pamr_1.57                 rhdf5_2.50.0             
##  [41] gplots_3.2.0              grid_4.4.1               
##  [43] blob_1.2.4                crayon_1.5.3             
##  [45] dir.expiry_1.14.0         lattice_0.22-6           
##  [47] msigdbr_7.5.1             annotate_1.84.0          
##  [49] KEGGREST_1.46.0           pillar_1.10.0            
##  [51] knitr_1.49                tcltk_4.4.1              
##  [53] boot_1.3-31               corpcor_1.6.10           
##  [55] codetools_0.2-20          glue_1.8.0               
##  [57] ggfun_0.1.8               data.table_1.16.4        
##  [59] vctrs_0.6.5               png_0.1-8                
##  [61] treeio_1.26.0             Rdpack_2.6.2             
##  [63] gtable_0.3.6              assertthat_0.2.1         
##  [65] cachem_1.1.0              xfun_0.49                
##  [67] rbibutils_2.3             Rfast_2.1.0              
##  [69] survival_3.7-0            iterators_1.0.14         
##  [71] statmod_1.5.0             nlme_3.1-166             
##  [73] tsne_0.1-3.1              pbkrtest_0.5.3           
##  [75] bit64_4.5.2               progress_1.2.3           
##  [77] EnvStats_3.0.0            filelock_1.0.3           
##  [79] bslib_0.8.0               irlba_2.3.5.1            
##  [81] vipor_0.4.7               KernSmooth_2.23-24       
##  [83] colorspace_2.1-1          rmeta_3.0                
##  [85] DBI_1.2.3                 tidyselect_1.2.1         
##  [87] bit_4.5.0                 compiler_4.4.1           
##  [89] graph_1.84.0              basilisk.utils_1.18.0    
##  [91] xml2_1.3.6                tcltk2_1.2-11            
##  [93] scales_1.3.0              caTools_1.18.3           
##  [95] remaCor_0.0.18            digest_0.6.37            
##  [97] minqa_1.2.8               rmarkdown_2.29           
##  [99] basilisk_1.18.0           aod_1.3.3                
## [101] XVector_0.46.0            RhpcBLASctl_0.23-42      
## [103] htmltools_0.5.8.1         pkgconfig_2.0.3          
## [105] lme4_1.1-35.5             sparseMatrixStats_1.18.0 
## [107] mashr_0.2.79              fastmap_1.2.0            
## [109] rlang_1.1.4               UCSC.utils_1.2.0         
## [111] DelayedMatrixStats_1.28.0 farver_2.1.2             
## [113] jquerylib_0.1.4           jsonlite_1.8.9           
## [115] RCurl_1.98-1.16           magrittr_2.0.3           
## [117] GenomeInfoDbData_1.2.13   ggplotify_0.1.2          
## [119] patchwork_1.3.0           Rhdf5lib_1.28.0          
## [121] munsell_0.5.1             Rcpp_1.0.13-1            
## [123] ape_5.8-1                 babelgene_22.9           
## [125] viridis_0.6.5             reticulate_1.40.0        
## [127] EnrichmentBrowser_2.36.0  RcppZiggurat_0.1.6       
## [129] stringi_1.8.4             zlibbioc_1.52.0          
## [131] MASS_7.3-61               plyr_1.8.9               
## [133] parallel_4.4.1            Biostrings_2.74.0        
## [135] splines_4.4.1             hms_1.1.3                
## [137] locfit_1.5-9.10           reshape2_1.4.4           
## [139] XML_3.99-0.17             evaluate_1.0.1           
## [141] RcppParallel_5.1.9        nloptr_2.1.1             
## [143] tzdb_0.4.0                ashr_2.2-63              
## [145] broom_1.0.7               xtable_1.8-4             
## [147] fANCOVA_0.6-1             e1071_1.7-16             
## [149] tidytree_0.4.6            viridisLite_0.4.2        
## [151] class_7.3-22              truncnorm_1.0-9          
## [153] stylo_0.7.5               lmerTest_3.1-3           
## [155] memoise_2.0.1             beeswarm_0.4.0           
## [157] AnnotationDbi_1.68.0      cluster_2.1.6            
## [159] timechange_0.3.0          GSEABase_1.68.0

Multimodally profiling memory T cells from a tuberculosis cohort identifies cell state associations with demographics, environment and disease (GSE158769)

Nathan, et al. Nat Immunol (2021)

Developed by Gabriel Hoffman

Run on 2025-01-02 18:34:50.285185

Loading libraries

Load data

Plot UMAP

Create pseudobulk

Plots of cell fractions

Evaluate crumblr and build tree

Variance partitioning analysis

Removing repeated measures

Differential testing

Multivariate test along hierarchy

Plot of one cell type

Session info