library(SingleCellExperiment)
library(zellkonverter)
library(DelayedArray)
library(dreamlet)
library(scater)
library(GSEABase)
library(zenith)
library(ggrepel)
library(knitr)
library(kableExtra)
library(scattermore)
library(cowplot)
library(ggplot2)
library(viridis)
library(qvalue)
library(tidyverse)
library(RColorBrewer)
library(BiocParallel)
library(arrow)
library(formula.tools)
# read H5AD file
file = "/sc/arion/projects/CommonMind/leed62/ref/published/2023_Mathys_Cell_snRNA/cell_browser/ad-aging-brain/240321_2023_Mathys_Cell_snRNA_cb_final_only_counts.h5ad"
sce = readH5AD(file, use_hdf5=TRUE, verbose=TRUE)#, obsm=FALSE, uns=FALSE, layers=FALSE, var=FALSE, obs=FALSE)
counts(sce) = assay(sce, "X")
sce$Cell_Type = factor(sce$Cell_Type, sort(levels(sce$Cell_Type)))
# create pseudobulk
pb <- aggregateToPseudoBulk(sce,
assay = "counts",
cluster_id = "Major_Cell_Type",
sample_id = "individualID")
plotProjection(sce, "X_umap", annotation='Major_Cell_Type')
df = data.frame(table(pb$ADdiag2types))
colnames(df) = c("Disease status", "Count")
df %>%
kbl(row.names=FALSE) %>%
kable_classic(full_width = FALSE)
| Disease status | Count |
|---|---|
| AD | 205 |
| NA | 3 |
| nonAD | 219 |
df = data.frame(table(pb$ADdiag3types))
colnames(df) = c("Disease status", "Count")
df %>%
kbl(row.names=FALSE) %>%
kable_classic(full_width = FALSE)
| Disease status | Count |
|---|---|
| NA | 3 |
| earlyAD | 132 |
| lateAD | 73 |
| nonAD | 219 |
df = list()
df$`# Samples` = ncol(pb)
df$`# Subjects` = nlevels(sce$Individual)
df$`# Cells` = ncol(sce)
df = unlist(df)
df = data.frame(Propert=names(df), count = df)
df %>%
kbl(row.names=FALSE) %>%
kable_classic(full_width = FALSE)
| Propert | count |
|---|---|
| # Samples | 427 |
| # Subjects | 427 |
| # Cells | 2327742 |
types = c("nonAD", "AD" )
pb$ADdiag2types = factor(pb$ADdiag2types, types)
types = c("nonAD", "earlyAD", "lateAD" )
pb$ADdiag3types = factor(pb$ADdiag3types, types)
# Normalize and apply voomWithDreamWeights
form = ~ age + sex + Study + pmi
res.proc = processAssays( pb,
form,
min.cells = 2,
min.count = 1,
min.samples = 4,
min.prop = 0.3,
BPPARAM = SnowParam(6))
details(res.proc)
## assay n_retain formula formDropsTerms n_genes n_errors
## 1 Ast 422 ~age + sex + Study + pmi FALSE 13835 0
## 2 Exc 422 ~age + sex + Study + pmi FALSE 16669 0
## 3 Inh 415 ~age + sex + Study + pmi FALSE 15492 0
## 4 Mic 419 ~age + sex + Study + pmi FALSE 12160 0
## 5 Oli 422 ~age + sex + Study + pmi FALSE 14692 0
## 6 Opc 422 ~age + sex + Study + pmi FALSE 13432 0
## 7 Vas 397 ~age + sex + Study + pmi FALSE 9714 0
## error_initial
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## 7 FALSE
# show voom plot for each cell clusters
plotVoom( res.proc, ncol=4)
# run variance partitioning analysis
form = ~ age + sex + Study + pmi + ADdiag2types
vp.lst = fitVarPart( res.proc, form, BPPARAM = SnowParam(6, progressbar=TRUE))
# Summarize variance fractions genome-wide for each cell type
plotVarPart(sortCols(vp.lst), label.angle=60, ncol=4)
form = ~ age + sex + Study + pmi + ADdiag2types
fit = dreamlet(res.proc, form)
plotVolcano(fit, "ADdiag2typesAD")
topTable(fit, coef = "ADdiag2typesAD", number=Inf) %>%
write.table(file="Mathys_2023_Major_Cell_Type_ADdiag2typesAD.tsv", quote=FALSE, sep="\t", row.names=FALSE)
ctorder = assayNames(fit)
genes = c("DUSP10", "PTPRG", 'NPNT', 'DPYD', "VGF", 'SPRY4-AS1', "PDE10A", "NCAM2", 'PCDH7')
fig.gene_heatmap = plotGeneHeatmap( fit, coef="ADdiag2typesAD", genes=genes, assays=ctorder)
fig.gene_heatmap
figList = lapply(genes, function(g){
plotForest( fit, coef = 'ADdiag2typesAD', gene = g, assays=ctorder) +
theme(aspect.ratio=1, legend.position="none")
})
names(figList) = genes
plot_grid(plotlist=figList, ncol=3)
gene_CT = c("PTPRG" = "Micro_PVM", 'NPNT' = "Astro", 'DPYD'= "Micro_PVM", 'SPRY4-AS1' = "EN_L3_5_IT_3", "PDE10A" = "EN_L6_IT", "NCAM2" = "EN_L3_5_IT_2", 'PCDH7' = "IN_LAMP5")
figList = lapply( names(gene_CT), function(g){
df = extractData(res.proc, gene_CT[g])
df = df[,c('Dx', g)]
colnames(df)[colnames(df)==g] = "expr"
ggplot(df, aes(Dx, expr, fill=Dx)) +
geom_boxplot() +
theme_classic() +
theme(aspect.ratio=1, plot.title = element_text(hjust = 0.5), legend.position="none") +
ggtitle(paste0(gene_CT[g], ": ", g)) +
ylab(bquote(log[2]~CPM)) +
scale_fill_manual(values=c("grey50", "red3"))
})
names(figList) = names(gene_CT)
plot_grid(plotlist=figList, ncol=3)
# Summarize differential expression for each coef and assay
df_de = fit %>%
topTable(coef='ADdiag2typesAD', number=Inf) %>%
as_tibble %>%
group_by(assay) %>%
summarise(
nGenes = length(adj.P.Val),
nDE = sum(adj.P.Val < 0.05),
pi1 = 1 - qvalue(P.Value)$pi0) %>%
mutate(assay = factor(assay, ctorder))
ymax = 1.05*max(df_de$nGenes)
fig1 = ggplot(df_de, aes(nGenes, assay, fill=assay)) +
geom_bar(stat="identity") +
theme_classic() +
theme(aspect.ratio=1, legend.position="none") +
scale_x_continuous(limits=c(0,ymax), expand=c(0,0)) +
xlab("# genes expressed") +
ylab("Cell type")
ymax = max(1.05*max(df_de$nDE), 100)
fig2 = ggplot(df_de, aes(nDE, assay, fill=assay)) +
geom_bar(stat="identity") +
theme_classic() +
theme(aspect.ratio=1, legend.position="none", axis.text.y=element_blank()) +
scale_x_continuous(limits=c(0,ymax), expand=c(0,0)) +
xlab("# genes with FDR < 5%") +
ylab('')
fig3 = ggplot(df_de, aes(pi1, assay, fill=assay)) +
geom_bar(stat="identity") +
theme_classic() +
theme(aspect.ratio=1, legend.position="none", axis.text.y=element_blank()) +
scale_x_continuous(limits=c(0,1), expand=c(0,0)) +
xlab(bquote(pi[1]))+
ylab('')
plot_grid(fig1, fig2, fig3, labels=LETTERS[1:3], nrow=1, axis="tblr", align="hv")
fig_de_count = ggplot(df_de, aes(2, assay)) +
geom_tile(fill="white") +
geom_point(aes(color=nDE/nGenes*100, size=nGenes)) +
scale_color_viridis(name = "% genes\npassing FDR", lim=c(0, NA)) +
scale_size_continuous(name = "# genes", breaks=c(1, 4, 8, 12, 16)*1000) +
theme_classic() +
theme(aspect.ratio=as.numeric(nrow(df_de)), axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
xlab('') + ylab('')
fig_de_count
# Number of cells observed per Subject
df = colData(sce) %>%
xtabs( ~ Individual + Major_Cell_Type,.) %>%
as_tibble %>%
pivot_longer(cols=Individual) %>%
mutate(assay = Major_Cell_Type) %>%
group_by(assay) %>%
summarize(sum.cells = sum(n))
df2 = df_vp %>%
as_tibble %>%
mutate(Donor = Dx + sex + Individual + Age) %>%
group_by(assay) %>%
summarize(mean.vp = mean(Donor))
df3 = inner_join(df, df2, by='assay') %>%
inner_join(details(res.proc) %>% as_tibble) %>%
mutate(assay = factor(assay, ctorder))
fit_n.cells = lm(mean.vp ~ log10(sum.cells/n_retain), df3)#, weights=df3$n_retain)
# summary(fit_n.cells)
pv = coef(summary(fit_n.cells))[2,4]
pv = format(pv, digits=2)
fig.vp_ncells = ggplot(df3, aes(sum.cells/n_retain, mean.vp, color=assay, size=n_retain, label=assay)) +
geom_abline(intercept=coef(fit_n.cells)[1], slope=coef(fit_n.cells)[2], color="grey30") +
geom_point() +
scale_x_log10() +
theme_classic() +
theme(aspect.ratio=1) +
scale_size(name="# Subjects", breaks=c(50, 100, 250, 500)) +
xlab("Mean # cells per Subject") +
ylab("Mean % variance across Subject") +
geom_text_repel(size=3, box.padding =.5, min.segment.length=1, max.overlaps = 15) + guides(color = "none") +
annotate("text", x=40, y=.7, label=bquote(p==.(pv)))
fig.vp_ncells
df3 = inner_join(df, df_de, by='assay') %>%
inner_join(details(res.proc) %>% as_tibble) %>%
mutate(assay = factor(assay, ctorder))
fit_n.cells = lm(nDE ~ log10(sum.cells/n_retain), df3)
pv = coef(summary(fit_n.cells))[2,4]
pv = format(pv, digits=2)
fig.nDE_ncells = ggplot(df3, aes(sum.cells/n_retain, nDE, color=assay, size=n_retain, label=assay)) +
geom_abline(intercept=coef(fit_n.cells)[1], slope=coef(fit_n.cells)[2], color="grey30") +
geom_point() +
scale_x_log10() +
theme_classic() +
theme(aspect.ratio=1) +
scale_size(name="# Subjects", breaks=c(50, 100, 250, 500)) +
xlab("Mean # nuclei per Subject") +
ylab("# differentially expressed genes") +
geom_text_repel(size=3, box.padding =.5, min.segment.length=1, max.overlaps = 15) + guides(color = "none") +
annotate("text", x=1000, y=.7, label=bquote(p==.(pv)))
fig.nDE_ncells
# Load Gene Ontology database
go.gs = get_GeneOntology(to="SYMBOL")
# remove set with > 1000 genes
go.gs = go.gs[sapply(go.gs, function(x) length(geneIds(x))) < 1000]
# zenith analysis
res.gsa = zenith_gsa(fit, go.gs, coefs="ADdiag2typesAD")
# remove "GO0022625: " from gene set name
res.gsa$Geneset = gsub("^GO\\S+ ", "", res.gsa$Geneset)
res.gsa$assay = factor(res.gsa$assay, ctorder)
# plotZenithResults(res.gsa, 10, 3) + theme(legend.position="bottom")
# get genesets with FDR < 5%
gs = unique(res.gsa$Geneset[res.gsa$FDR < 0.05])
# keep only results of these genesets
df = res.gsa[res.gsa$Geneset %in% gs,]
# plot results, but with no limit based on the highest/lowest t-statistic
plotZenithResults(df, Inf, Inf)
# Scaled is reversed
colData(res.proc)$CERAD = 5 - colData(res.proc)$ceradsc
traits = c("ADdiag2types", "CERAD", 'ADdiag3types', 'braaksc', 'cogdx', "ordered(CERAD)", 'ordered(ADdiag3types)', 'ordered(braaksc)', 'ordered(cogdx)')
form.base = ~ age + sex + Study + pmi
fitList = lapply( traits, function(trait){
form = as.formula(paste(as.character(form.base), "+", trait))
fit = dreamlet(res.proc, form)
})
names(fitList) = traits
saveRDS(fitList, file="Mathys_Cell_2023_Major_Cell_Type_dreamlet_fit.RDS")
df_all = lapply(names(fitList), function(trait){
fit = fitList[[trait]]
coefSet = coefNames(fit)[-seq(5)]
tab = topTable(fit, coef=coefSet, number=Inf)
tab$trait = gsub("ordered\\((\\S+)\\)", "\\1", trait)
tab$coefSet = paste(coefSet, collapse=' + ')
tab = as_tibble(tab)
if( length(coefSet) == 1){
cols = c("assay", "ID", "logFC", "AveExpr", "t", "P.Value", "adj.P.Val", "trait", "coefSet")
tab = tab %>%
select(all_of(cols)) %>%
rename(stat = "t")
}else{
cols = c("assay", "ID", "logFC", "AveExpr", "F", "P.Value", "adj.P.Val","trait", "coefSet")
tab = tab %>%
mutate(logFC = NA) %>%
select(all_of(cols)) %>%
rename(stat = "F")
}
}) %>%
bind_rows %>%
arrange(trait, assay, ID, adj.P.Val)
write_parquet(df_all, "Mathys_Cell_2023_Major_Cell_Type_dreamlet.parquet")
sessionInfo()
## R version 4.3.3 (2024-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /hpc/packages/minerva-centos7/oneAPI/p_2024.1.0.560/toolkits/mkl/2024.1/lib/libmkl_gf_lp64.so.2; LAPACK version 3.11.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/New_York
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices datasets utils methods
## [8] base
##
## other attached packages:
## [1] formula.tools_1.7.1 arrow_15.0.1
## [3] RColorBrewer_1.1-3 lubridate_1.9.3
## [5] forcats_1.0.0 stringr_1.5.1
## [7] dplyr_1.1.4 purrr_1.0.2
## [9] readr_2.1.5 tidyr_1.3.1
## [11] tibble_3.2.1 tidyverse_2.0.0
## [13] qvalue_2.34.0 viridis_0.6.5
## [15] viridisLite_0.4.2 cowplot_1.1.3
## [17] scattermore_1.2 kableExtra_1.4.0
## [19] knitr_1.45 ggrepel_0.9.5
## [21] zenith_1.4.2 GSEABase_1.64.0
## [23] graph_1.80.0 annotate_1.80.0
## [25] XML_3.99-0.16.1 AnnotationDbi_1.64.1
## [27] scater_1.30.1 scuttle_1.12.0
## [29] dreamlet_1.1.23 variancePartition_1.33.15
## [31] BiocParallel_1.36.0 limma_3.58.1
## [33] ggplot2_3.5.1 DelayedArray_0.28.0
## [35] SparseArray_1.2.4 S4Arrays_1.2.1
## [37] abind_1.4-5 Matrix_1.6-5
## [39] zellkonverter_1.13.3 SingleCellExperiment_1.24.0
## [41] SummarizedExperiment_1.32.0 Biobase_2.62.0
## [43] GenomicRanges_1.54.1 GenomeInfoDb_1.38.8
## [45] IRanges_2.36.0 S4Vectors_0.40.2
## [47] BiocGenerics_0.48.1 MatrixGenerics_1.14.0
## [49] matrixStats_1.3.0
##
## loaded via a namespace (and not attached):
## [1] splines_4.3.3 bitops_1.0-7
## [3] filelock_1.0.3 basilisk.utils_1.14.1
## [5] lifecycle_1.0.4 Rdpack_2.6
## [7] mixsqp_0.3-54 edgeR_4.0.16
## [9] lattice_0.22-5 MASS_7.3-60.0.1
## [11] backports_1.4.1 magrittr_2.0.3
## [13] metafor_4.6-0 sass_0.4.9
## [15] rmarkdown_2.26 jquerylib_0.1.4
## [17] yaml_2.3.8 reticulate_1.36.1
## [19] DBI_1.2.2 minqa_1.2.6
## [21] zlibbioc_1.48.2 EnvStats_2.8.1
## [23] rmeta_3.0 msigdbr_7.5.1
## [25] RCurl_1.98-1.14 GenomeInfoDbData_1.2.11
## [27] pbkrtest_0.5.2 irlba_2.3.5.1
## [29] svglite_2.1.3 DelayedMatrixStats_1.24.0
## [31] codetools_0.2-19 xml2_1.3.6
## [33] tidyselect_1.2.1 farver_2.1.2
## [35] ScaledMatrix_1.10.0 lme4_1.1-35.2
## [37] mathjaxr_1.6-0 jsonlite_1.8.8
## [39] BiocNeighbors_1.20.2 iterators_1.0.14
## [41] systemfonts_1.1.0 tools_4.3.3
## [43] progress_1.2.3 Rcpp_1.0.12
## [45] glue_1.7.0 gridExtra_2.3
## [47] xfun_0.43 withr_3.0.0
## [49] numDeriv_2016.8-1.1 fastmap_1.2.0
## [51] basilisk_1.14.3 boot_1.3-29
## [53] fansi_1.0.6 rsvd_1.0.5
## [55] caTools_1.18.2 digest_0.6.35
## [57] truncnorm_1.0-9 timechange_0.3.0
## [59] R6_2.5.1 colorspace_2.1-0
## [61] gtools_3.9.5 RSQLite_2.3.6
## [63] RhpcBLASctl_0.23-42 utf8_1.2.4
## [65] generics_0.1.3 data.table_1.15.4
## [67] corpcor_1.6.10 prettyunits_1.2.0
## [69] httr_1.4.7 pkgconfig_2.0.3
## [71] gtable_0.3.5 blob_1.2.4
## [73] XVector_0.42.0 remaCor_0.0.18
## [75] htmltools_0.5.8.1 scales_1.3.0
## [77] png_0.1-8 fANCOVA_0.6-1
## [79] ashr_2.2-63 rstudioapi_0.16.0
## [81] tzdb_0.4.0 reshape2_1.4.4
## [83] nlme_3.1-164 nloptr_2.0.3
## [85] cachem_1.1.0 operator.tools_1.6.3
## [87] KernSmooth_2.23-22 parallel_4.3.3
## [89] vipor_0.4.7 metadat_1.2-0
## [91] RcppZiggurat_0.1.6 pillar_1.9.0
## [93] grid_4.3.3 vctrs_0.6.5
## [95] gplots_3.1.3.1 BiocSingular_1.18.0
## [97] beachmat_2.18.1 mashr_0.2.79
## [99] xtable_1.8-4 beeswarm_0.4.0
## [101] Rgraphviz_2.46.0 evaluate_0.23
## [103] KEGGgraph_1.62.0 invgamma_1.1
## [105] mvtnorm_1.2-4 cli_3.6.2
## [107] locfit_1.5-9.9 compiler_4.3.3
## [109] rlang_1.1.3 crayon_1.5.2
## [111] SQUAREM_2021.1 labeling_0.4.3
## [113] plyr_1.8.9 ggbeeswarm_0.7.2
## [115] stringi_1.8.4 assertthat_0.2.1
## [117] babelgene_22.9 lmerTest_3.1-3
## [119] munsell_0.5.1 Biostrings_2.70.3
## [121] aod_1.3.3 dir.expiry_1.10.0
## [123] hms_1.1.3 sparseMatrixStats_1.14.0
## [125] bit64_4.0.5 KEGGREST_1.42.0
## [127] statmod_1.5.0 highr_0.10
## [129] rbibutils_2.2.16 Rfast_2.1.0
## [131] broom_1.0.6 memoise_2.0.1
## [133] RcppParallel_5.1.7 bslib_0.7.0
## [135] bit_4.0.5 EnrichmentBrowser_2.32.0