Perform dreamlet analysis of single cell data assaying T cell populations in donors infected with tuberculosis.
library(SingleCellExperiment)
library(zellkonverter)
library(ggplot2)
library(scattermore)
library(ggtree)
library(crumblr)
library(aplot)
library(tidyverse)
library(dreamlet)
library(kableExtra)
library(ggcorrplot)
library(RColorBrewer)
library(DelayedArray)
# read H5AD file
path = "/sc/arion/projects/CommonMind/hoffman/scRNAseq_data/Nathan_NatImm_2021/"
file = paste0(path, "/Nathan_NatImm_2021.h5ad")
sce = readH5AD(file, use_hdf5=TRUE)
counts(sce) = assay(sce, "X")
# get only single-cell RNA-seq
# rest is CITE-seq proteins
only_rna = grep("prot", rownames(sce), invert=TRUE)
# create variable that distinguishes cells from the same donor
# but different batches
sce$donor_batch = with(colData(sce), paste(donor, batch))
sce$batch = factor(sce$batch)
sce$season = factor(sce$season)
sce$TB_status = factor(sce$TB_status, c("CONTROL", "CASE"))
reducedDim(sce, "UMAP") = with(colData(sce), data.frame(UMAP_1, UMAP_2))
# sort cell clusters by name
sce = sce[only_rna,sce$cluster_name != "NA"]
colData(sce) = droplevels(colData(sce))
tab = colData(sce) %>%
as_tibble %>%
group_by(cluster_ids, cluster_name) %>%
filter(row_number()==1) %>%
summarise(cluster_ids, cluster_name) %>%
arrange(as.numeric(gsub("C-", '', cluster_ids)))
sce$cluster_name = factor(sce$cluster_name, unique(tab$cluster_name))
sce$cluster_ids = factor(sce$cluster_ids, unique(tab$cluster_ids))
colData(sce) %>%
data.frame %>%
ggplot(aes(UMAP_1, UMAP_2, color=cluster_name)) +
geom_scattermore() +
theme_classic() +
theme(aspect.ratio=1)
pb <- aggregateToPseudoBulk(sce,
assay = "counts",
cluster_id = "cluster_name",
sample_id = "donor_batch")
fracs = cellCounts(pb) / rowSums(cellCounts(pb))
df = data.frame(fracs, TB_status = pb$TB_status, check.names=FALSE)
df = reshape2::melt(df, id.vars="TB_status")
ggplot(df, aes(TB_status, value, fill=variable)) +
geom_violin() +
geom_boxplot(fill="grey50", width=.1) +
facet_wrap(~ variable) +
theme_classic() +
theme(aspect.ratio=1, legend.position="none") +
ylab("Fraction")
# run crumblr on cell counts
cobj = crumblr(cellCounts(pb))
# build tree from pseudobulk expression
hcl = buildClusterTreeFromPB( pb )
Scale age
to avoid numerical issues.
Keep only a single sample per donor
form = ~ as.numeric(TB_status) + (1|batch) + (1|season) + (1|sex) + scale(age) + I(scale(age)^2) + scale(prop_NAT)
keep = !duplicated(colData(pb)$donor)
vp = fitExtractVarPartModel(cobj[,keep], form, colData(pb)[keep,])
# Rename columns and sum 2 age components
vp$age = vp[,'scale(age)'] + vp[,'I(scale(age)^2)']
vp[,'scale(age)'] = c()
vp[,'I(scale(age)^2)'] = c()
vp[,'TB_status'] = vp[,'as.numeric(TB_status)']
vp[,'as.numeric(TB_status)'] = c()
vp[,'Ancestry'] = vp[,'scale(prop_NAT)']
vp[,'scale(prop_NAT)'] = c()
cols = c(brewer.pal(ncol(vp)-1, "Set1"), "grey85")
plotVarPart(sortCols(vp), col=cols)
fig.vp = plotPercentBars(sortCols(vp), col=cols)
fig.vp
# Run precision-weighted linear mixed model using dream()
# and compute empirical Bayes moderated t-statistics
form = ~ TB_status + (1|batch) + (1|season) + (1|sex) + scale(age) + I(scale(age)^2) + scale(prop_NAT)
fit = dream(cobj[,keep], form, colData(pb)[keep,])
fit = eBayes(fit)
# show results
topTable(fit, coef='TB_statusCASE', number=Inf) %>%
select(logFC, AveExpr, t, P.Value, adj.P.Val) %>%
kbl() %>%
kable_classic(full_width = FALSE)
logFC | AveExpr | t | P.Value | adj.P.Val | |
---|---|---|---|---|---|
CD4+ Th17 | -0.2257359 | 0.4235041 | -4.7370817 | 0.0000037 | 0.0001084 |
CD4+ Th17/1 | -0.1437905 | 1.3118369 | -3.7573419 | 0.0002210 | 0.0032051 |
CD4+ CD161+ Th1 | -0.1729482 | 0.4253747 | -3.1423799 | 0.0018696 | 0.0180723 |
CD4+ Th1 | -0.1127846 | 1.2720219 | -2.7894947 | 0.0056725 | 0.0376635 |
CD4+ CD27+CD161+ | -0.1352943 | 1.3537376 | -2.7455767 | 0.0064937 | 0.0376635 |
CD4/8+ PD-1+TIGIT+ | 0.1530373 | -0.3545267 | 2.5151422 | 0.0125014 | 0.0604234 |
CD4+ cytotoxic | 0.2476681 | 0.3599038 | 2.4280652 | 0.0158939 | 0.0658462 |
CD8+ GZMK+ | 0.1511185 | 0.1385331 | 2.0603913 | 0.0403559 | 0.1462902 |
CD4+ central | -0.0785180 | 0.9132650 | -1.9409260 | 0.0534599 | 0.1722596 |
CD4+ RORC+ Treg | -0.0923229 | -1.1832085 | -1.7996958 | 0.0730948 | 0.1972883 |
CD8+ GZMB+ | 0.1926524 | 0.0288309 | 1.7891270 | 0.0748335 | 0.1972883 |
CD4+ CD161+ cytotoxic | 0.1763841 | -0.6023561 | 1.6587939 | 0.0985623 | 0.2381922 |
CD4+ CD27+ | -0.0622216 | 1.3197772 | -1.3817057 | 0.1682514 | 0.3590821 |
CD4+ CCR5+ cytotoxic | 0.1008632 | -1.1284844 | 1.3514941 | 0.1777094 | 0.3590821 |
Vd2 | 0.1187868 | -0.5075829 | 1.3190773 | 0.1884217 | 0.3590821 |
CD4+ Treg | -0.0742707 | 0.7746323 | -1.2908919 | 0.1981143 | 0.3590821 |
CD4+ lncRNA | -0.0575043 | 0.6657067 | -1.2135632 | 0.2261575 | 0.3857982 |
CD8+ central | 0.0791749 | -0.6547533 | 1.0919502 | 0.2758852 | 0.4444818 |
CD8+ CXCR3+ | 0.0603235 | -0.9614326 | 0.9505680 | 0.3427493 | 0.5231437 |
CD4+ CCR4+ICOS+ central | -0.0435270 | 0.9873410 | -0.8930360 | 0.3727905 | 0.5405462 |
CD4+ CCR4+ | -0.0369053 | 0.5048199 | -0.7776485 | 0.4374817 | 0.6041414 |
CD4+ activated | 0.0440065 | -0.4235253 | 0.7040916 | 0.4820850 | 0.6267070 |
CD8+ activated | 0.0542842 | -1.6675538 | 0.6801943 | 0.4970435 | 0.6267070 |
CD4+ CD38+ICOS+ central | -0.0274512 | -0.8408021 | -0.5619769 | 0.5746432 | 0.6811781 |
Vd1 | 0.0484634 | -3.4950052 | 0.5436273 | 0.5872225 | 0.6811781 |
CD4+ HLA-DR+ | 0.0217049 | -0.5522151 | 0.3499343 | 0.7269274 | 0.8108037 |
CD4+ CCR4+ central | -0.0158614 | 0.9945189 | -0.2849111 | 0.7759731 | 0.8334526 |
CD4+ CD161+ Th2 | -0.0005680 | 0.2044739 | -0.0070329 | 0.9943954 | 0.9956035 |
CD4+ Th2 | -0.0003209 | 0.6931680 | -0.0055163 | 0.9956035 | 0.9956035 |
# Perform multivariate testing
res1 = treeTest( fit, cobj, hcl, coef="TB_statusCASE")
fig1 = plotTreeTestBeta(res1)
fig2 = crumblr::plotForest(res1, hide=TRUE)
fig3 = plotPercentBars( sortCols(vp), col=cols)
fig2 %>% insert_left(fig1) %>% insert_right(fig3)
# get CLR values from crumblr, but don't regularize weights
cobj = crumblr(cellCounts(pb[,keep]), max.ratio=Inf)
library(ggbeeswarm)
library(ggrepel)
CT = "CD4+ Th17"
df = data.frame(CLR = cobj$E[CT,],
TB_status = pb$TB_status[keep],
se = 1/sqrt(cobj$weights[CT,]),
counts = cellCounts(pb[,keep])[,CT],
totalCells = rowSums(cellCounts(pb[,keep])),
colData(pb[,keep]))
# points to highlight
df_count1 = df %>%
filter(counts > 0) %>%
arrange(-se) %>%
head(2) %>%
select(CLR, TB_status, se, counts, totalCells)
df_count2 = df %>%
arrange(se) %>%
head(2) %>%
select(CLR, TB_status, se, counts, totalCells)
df_count = rbind(df_count1, df_count2)
n.skip = sum(scale(df$se) > 3)
zmax = sort(df$se, decreasing=TRUE)[n.skip+1]
df %>%
arrange(se) %>%
ggplot(aes(TB_status, CLR, color=pmin(zmax, se))) +
geom_boxplot(width=.3) +
geom_beeswarm(cex = 2, size=3) +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5), aspect.ratio=1, axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ggtitle(CT) +
scale_color_gradient2(low="grey", high="red", limits= c(0,zmax*1.01), name = "Standard error") +
geom_text_repel(data=df_count, aes(TB_status, CLR, label=paste(counts, totalCells, sep=' / ')), color="black", box.padding=2)
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Rocky Linux 9.4 (Blue Onyx)
##
## Matrix products: default
## BLAS: /hpc/packages/minerva-rocky9/R/4.4.1/lib64/R/lib/libRblas.so
## LAPACK: /hpc/packages/minerva-rocky9/R/4.4.1/lib64/R/lib/libRlapack.so; LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/New_York
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices datasets utils methods
## [8] base
##
## other attached packages:
## [1] ggrepel_0.9.6 ggbeeswarm_0.7.2
## [3] DelayedArray_0.32.0 SparseArray_1.6.0
## [5] S4Arrays_1.6.0 abind_1.4-8
## [7] Matrix_1.7-1 RColorBrewer_1.1-3
## [9] ggcorrplot_0.1.4.1 kableExtra_1.4.0
## [11] dreamlet_1.4.1 variancePartition_1.36.3
## [13] BiocParallel_1.40.0 limma_3.62.1
## [15] lubridate_1.9.3 forcats_1.0.0
## [17] stringr_1.5.1 dplyr_1.1.4
## [19] purrr_1.0.2 readr_2.1.5
## [21] tidyr_1.3.1 tibble_3.2.1
## [23] tidyverse_2.0.0 aplot_0.2.4
## [25] crumblr_0.99.11 ggtree_3.10.1
## [27] scattermore_1.2 ggplot2_3.5.1
## [29] zellkonverter_1.16.0 SingleCellExperiment_1.28.1
## [31] SummarizedExperiment_1.36.0 Biobase_2.66.0
## [33] GenomicRanges_1.58.0 GenomeInfoDb_1.42.1
## [35] IRanges_2.40.0 S4Vectors_0.44.0
## [37] BiocGenerics_0.52.0 MatrixGenerics_1.18.0
## [39] matrixStats_1.4.1
##
## loaded via a namespace (and not attached):
## [1] fs_1.6.5 bitops_1.0-9
## [3] httr_1.4.7 Rgraphviz_2.50.0
## [5] numDeriv_2016.8-1.1 tools_4.4.1
## [7] backports_1.5.0 R6_2.5.1
## [9] metafor_4.6-0 HDF5Array_1.34.0
## [11] lazyeval_0.2.2 rhdf5filters_1.18.0
## [13] withr_3.0.2 prettyunits_1.2.0
## [15] gridExtra_2.3 cli_3.6.3
## [17] labeling_0.4.3 sass_0.4.9
## [19] KEGGgraph_1.66.0 SQUAREM_2021.1
## [21] mvtnorm_1.3-2 proxy_0.4-27
## [23] mixsqp_0.3-54 systemfonts_1.1.0
## [25] yulab.utils_0.1.8 svglite_2.1.3
## [27] zenith_1.8.0 invgamma_1.1
## [29] rstudioapi_0.17.1 RSQLite_2.3.8
## [31] generics_0.1.3 gridGraphics_0.5-1
## [33] gtools_3.9.5 metadat_1.2-0
## [35] lifecycle_1.0.4 yaml_2.3.10
## [37] edgeR_4.4.1 mathjaxr_1.6-0
## [39] pamr_1.57 rhdf5_2.50.0
## [41] gplots_3.2.0 grid_4.4.1
## [43] blob_1.2.4 crayon_1.5.3
## [45] dir.expiry_1.14.0 lattice_0.22-6
## [47] msigdbr_7.5.1 annotate_1.84.0
## [49] KEGGREST_1.46.0 pillar_1.10.0
## [51] knitr_1.49 tcltk_4.4.1
## [53] boot_1.3-31 corpcor_1.6.10
## [55] codetools_0.2-20 glue_1.8.0
## [57] ggfun_0.1.8 data.table_1.16.4
## [59] vctrs_0.6.5 png_0.1-8
## [61] treeio_1.26.0 Rdpack_2.6.2
## [63] gtable_0.3.6 assertthat_0.2.1
## [65] cachem_1.1.0 xfun_0.49
## [67] rbibutils_2.3 Rfast_2.1.0
## [69] survival_3.7-0 iterators_1.0.14
## [71] statmod_1.5.0 nlme_3.1-166
## [73] tsne_0.1-3.1 pbkrtest_0.5.3
## [75] bit64_4.5.2 progress_1.2.3
## [77] EnvStats_3.0.0 filelock_1.0.3
## [79] bslib_0.8.0 irlba_2.3.5.1
## [81] vipor_0.4.7 KernSmooth_2.23-24
## [83] colorspace_2.1-1 rmeta_3.0
## [85] DBI_1.2.3 tidyselect_1.2.1
## [87] bit_4.5.0 compiler_4.4.1
## [89] graph_1.84.0 basilisk.utils_1.18.0
## [91] xml2_1.3.6 tcltk2_1.2-11
## [93] scales_1.3.0 caTools_1.18.3
## [95] remaCor_0.0.18 digest_0.6.37
## [97] minqa_1.2.8 rmarkdown_2.29
## [99] basilisk_1.18.0 aod_1.3.3
## [101] XVector_0.46.0 RhpcBLASctl_0.23-42
## [103] htmltools_0.5.8.1 pkgconfig_2.0.3
## [105] lme4_1.1-35.5 sparseMatrixStats_1.18.0
## [107] mashr_0.2.79 fastmap_1.2.0
## [109] rlang_1.1.4 UCSC.utils_1.2.0
## [111] DelayedMatrixStats_1.28.0 farver_2.1.2
## [113] jquerylib_0.1.4 jsonlite_1.8.9
## [115] RCurl_1.98-1.16 magrittr_2.0.3
## [117] GenomeInfoDbData_1.2.13 ggplotify_0.1.2
## [119] patchwork_1.3.0 Rhdf5lib_1.28.0
## [121] munsell_0.5.1 Rcpp_1.0.13-1
## [123] ape_5.8-1 babelgene_22.9
## [125] viridis_0.6.5 reticulate_1.40.0
## [127] EnrichmentBrowser_2.36.0 RcppZiggurat_0.1.6
## [129] stringi_1.8.4 zlibbioc_1.52.0
## [131] MASS_7.3-61 plyr_1.8.9
## [133] parallel_4.4.1 Biostrings_2.74.0
## [135] splines_4.4.1 hms_1.1.3
## [137] locfit_1.5-9.10 reshape2_1.4.4
## [139] XML_3.99-0.17 evaluate_1.0.1
## [141] RcppParallel_5.1.9 nloptr_2.1.1
## [143] tzdb_0.4.0 ashr_2.2-63
## [145] broom_1.0.7 xtable_1.8-4
## [147] fANCOVA_0.6-1 e1071_1.7-16
## [149] tidytree_0.4.6 viridisLite_0.4.2
## [151] class_7.3-22 truncnorm_1.0-9
## [153] stylo_0.7.5 lmerTest_3.1-3
## [155] memoise_2.0.1 beeswarm_0.4.0
## [157] AnnotationDbi_1.68.0 cluster_2.1.6
## [159] timechange_0.3.0 GSEABase_1.68.0