Load packages

library(SingleCellExperiment)
library(zellkonverter)
library(DelayedArray)
library(dreamlet)
library(scater)
library(GSEABase)
library(zenith)
library(ggrepel)
library(knitr)
library(kableExtra)
library(scattermore)
library(cowplot)
library(ggplot2)
library(viridis)
library(qvalue)
library(tidyverse)
library(RColorBrewer)
library(BiocParallel)
library(arrow)
library(formula.tools)

Load data

# read H5AD file
file = "/sc/arion/projects/CommonMind/leed62/ref/published/2023_Mathys_Cell_snRNA/cell_browser/ad-aging-brain/240321_2023_Mathys_Cell_snRNA_cb_final_only_counts.h5ad"
sce = readH5AD(file, use_hdf5=TRUE, verbose=TRUE)#, obsm=FALSE, uns=FALSE, layers=FALSE, var=FALSE, obs=FALSE)   
counts(sce) = assay(sce, "X")

sce$Cell_Type = factor(sce$Cell_Type, sort(levels(sce$Cell_Type)))

# create pseudobulk 
pb <- aggregateToPseudoBulk(sce,
    assay = "counts",     
    cluster_id = "Major_Cell_Type",  
    sample_id = "individualID")

UMAP embedding

plotProjection(sce, "X_umap", annotation='Major_Cell_Type') 

Properties of dataset

df = data.frame(table(pb$ADdiag2types))
colnames(df) = c("Disease status", "Count")

df %>%
  kbl(row.names=FALSE) %>% 
  kable_classic(full_width = FALSE) 
Disease status Count
AD 205
NA 3
nonAD 219
df = data.frame(table(pb$ADdiag3types))
colnames(df) = c("Disease status", "Count")

df %>%
  kbl(row.names=FALSE) %>% 
  kable_classic(full_width = FALSE) 
Disease status Count
NA 3
earlyAD 132
lateAD 73
nonAD 219
df = list()
df$`# Samples` = ncol(pb)
df$`# Subjects` = nlevels(sce$Individual)
df$`# Cells` = ncol(sce)
df = unlist(df)
df = data.frame(Propert=names(df), count = df)

df %>%  
  kbl(row.names=FALSE) %>% 
  kable_classic(full_width = FALSE) 
Propert count
# Samples 427
# Subjects 427
# Cells 2327742

Process pseudobulk data to estimate precision weights

types = c("nonAD", "AD" )
pb$ADdiag2types = factor(pb$ADdiag2types, types)

types = c("nonAD", "earlyAD", "lateAD" )
pb$ADdiag3types = factor(pb$ADdiag3types, types)

# Normalize and apply voomWithDreamWeights
form = ~ age + sex + Study + pmi
res.proc = processAssays( pb, 
                          form,  
                          min.cells = 2,
                          min.count = 1,
                          min.samples = 4,
                          min.prop = 0.3,
                          BPPARAM = SnowParam(6))

Show details of processing

details(res.proc) 
##   assay n_retain                  formula formDropsTerms n_genes n_errors
## 1   Ast      422 ~age + sex + Study + pmi          FALSE   13835        0
## 2   Exc      422 ~age + sex + Study + pmi          FALSE   16669        0
## 3   Inh      415 ~age + sex + Study + pmi          FALSE   15492        0
## 4   Mic      419 ~age + sex + Study + pmi          FALSE   12160        0
## 5   Oli      422 ~age + sex + Study + pmi          FALSE   14692        0
## 6   Opc      422 ~age + sex + Study + pmi          FALSE   13432        0
## 7   Vas      397 ~age + sex + Study + pmi          FALSE    9714        0
##   error_initial
## 1         FALSE
## 2         FALSE
## 3         FALSE
## 4         FALSE
## 5         FALSE
## 6         FALSE
## 7         FALSE

Variance partitioning analysis

# run variance partitioning analysis
form = ~ age + sex + Study + pmi + ADdiag2types 
vp.lst = fitVarPart( res.proc, form, BPPARAM = SnowParam(6, progressbar=TRUE))
# Summarize variance fractions genome-wide for each cell type
plotVarPart(sortCols(vp.lst), label.angle=60, ncol=4)   

dreamlet

form = ~ age + sex + Study + pmi + ADdiag2types 
fit = dreamlet(res.proc, form) 

Volcano

plotVolcano(fit, "ADdiag2typesAD")

topTable(fit, coef = "ADdiag2typesAD", number=Inf) %>%
  write.table(file="Mathys_2023_Major_Cell_Type_ADdiag2typesAD.tsv", quote=FALSE, sep="\t", row.names=FALSE)

Highlight genes

ctorder = assayNames(fit)

genes = c("DUSP10", "PTPRG", 'NPNT', 'DPYD', "VGF", 'SPRY4-AS1', "PDE10A", "NCAM2", 'PCDH7') 
fig.gene_heatmap = plotGeneHeatmap( fit, coef="ADdiag2typesAD", genes=genes, assays=ctorder)  
fig.gene_heatmap

figList = lapply(genes, function(g){  
  plotForest( fit, coef = 'ADdiag2typesAD', gene = g, assays=ctorder) +
    theme(aspect.ratio=1, legend.position="none") 
}) 
names(figList) = genes
plot_grid(plotlist=figList, ncol=3)

gene_CT = c("PTPRG" = "Micro_PVM", 'NPNT' = "Astro", 'DPYD'= "Micro_PVM", 'SPRY4-AS1' = "EN_L3_5_IT_3", "PDE10A" = "EN_L6_IT", "NCAM2" = "EN_L3_5_IT_2", 'PCDH7' = "IN_LAMP5")

figList = lapply( names(gene_CT), function(g){

  df = extractData(res.proc, gene_CT[g])

  df = df[,c('Dx', g)]
  colnames(df)[colnames(df)==g] = "expr"

  ggplot(df, aes(Dx, expr, fill=Dx)) +
          geom_boxplot() +
          theme_classic() +
          theme(aspect.ratio=1, plot.title = element_text(hjust = 0.5), legend.position="none") +
          ggtitle(paste0(gene_CT[g], ": ", g)) +
          ylab(bquote(log[2]~CPM)) +
          scale_fill_manual(values=c("grey50", "red3"))
})
names(figList) = names(gene_CT)

plot_grid(plotlist=figList, ncol=3)

Summarize differential expression

# Summarize differential expression for each coef and assay
df_de = fit %>%
  topTable(coef='ADdiag2typesAD', number=Inf) %>%
    as_tibble %>% 
    group_by(assay) %>% 
    summarise( 
      nGenes = length(adj.P.Val), 
      nDE = sum(adj.P.Val < 0.05),
      pi1 = 1 - qvalue(P.Value)$pi0) %>%
  mutate(assay = factor(assay, ctorder))  

ymax = 1.05*max(df_de$nGenes)
fig1 = ggplot(df_de, aes(nGenes, assay, fill=assay)) + 
    geom_bar(stat="identity") + 
    theme_classic() +
    theme(aspect.ratio=1, legend.position="none") +
    scale_x_continuous(limits=c(0,ymax), expand=c(0,0)) +
    xlab("# genes expressed") +
    ylab("Cell type") 

ymax = max(1.05*max(df_de$nDE), 100)
fig2 = ggplot(df_de, aes(nDE, assay, fill=assay)) + 
    geom_bar(stat="identity") + 
    theme_classic() +
    theme(aspect.ratio=1, legend.position="none", axis.text.y=element_blank()) +
    scale_x_continuous(limits=c(0,ymax), expand=c(0,0)) +
    xlab("# genes with FDR < 5%") +
    ylab('')

fig3 = ggplot(df_de, aes(pi1, assay, fill=assay)) + 
    geom_bar(stat="identity") + 
    theme_classic() +
    theme(aspect.ratio=1, legend.position="none", axis.text.y=element_blank()) +
    scale_x_continuous(limits=c(0,1), expand=c(0,0)) +
    xlab(bquote(pi[1]))+
    ylab('')

plot_grid(fig1, fig2, fig3, labels=LETTERS[1:3], nrow=1, axis="tblr", align="hv")

fig_de_count = ggplot(df_de, aes(2, assay)) +
        geom_tile(fill="white") +
        geom_point(aes(color=nDE/nGenes*100, size=nGenes)) +
        scale_color_viridis(name = "% genes\npassing FDR", lim=c(0, NA)) +
        scale_size_continuous(name = "# genes", breaks=c(1, 4, 8, 12, 16)*1000) +
        theme_classic() +
        theme(aspect.ratio=as.numeric(nrow(df_de)), axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
        xlab('') + ylab('') 
fig_de_count

# Number of cells observed per Subject
df = colData(sce) %>%
  xtabs( ~ Individual + Major_Cell_Type,.) %>%
  as_tibble %>%
  pivot_longer(cols=Individual) %>%
  mutate(assay = Major_Cell_Type) %>%
  group_by(assay) %>%
  summarize(sum.cells = sum(n)) 
df2 = df_vp %>%
      as_tibble %>%
      mutate(Donor = Dx + sex + Individual + Age) %>%
      group_by(assay) %>%
      summarize(mean.vp = mean(Donor))

df3 = inner_join(df, df2, by='assay') %>%
        inner_join(details(res.proc) %>% as_tibble) %>%
        mutate(assay = factor(assay, ctorder)) 

fit_n.cells = lm(mean.vp ~ log10(sum.cells/n_retain), df3)#, weights=df3$n_retain)
# summary(fit_n.cells)
pv = coef(summary(fit_n.cells))[2,4]
pv = format(pv, digits=2)

fig.vp_ncells = ggplot(df3, aes(sum.cells/n_retain, mean.vp, color=assay, size=n_retain, label=assay)) +
  geom_abline(intercept=coef(fit_n.cells)[1], slope=coef(fit_n.cells)[2], color="grey30") +
  geom_point() +
  scale_x_log10() +
  theme_classic() +
  theme(aspect.ratio=1) + 
  scale_size(name="# Subjects", breaks=c(50, 100, 250, 500)) +
  xlab("Mean # cells per Subject") +
  ylab("Mean % variance across Subject") +
  geom_text_repel(size=3, box.padding =.5, min.segment.length=1, max.overlaps = 15) + guides(color = "none") +
  annotate("text", x=40, y=.7, label=bquote(p==.(pv)))
fig.vp_ncells
df3 = inner_join(df, df_de, by='assay') %>%
        inner_join(details(res.proc) %>% as_tibble) %>%
        mutate(assay = factor(assay, ctorder)) 

fit_n.cells = lm(nDE ~ log10(sum.cells/n_retain), df3)
pv = coef(summary(fit_n.cells))[2,4]
pv = format(pv, digits=2)

fig.nDE_ncells = ggplot(df3, aes(sum.cells/n_retain, nDE, color=assay, size=n_retain, label=assay)) +
  geom_abline(intercept=coef(fit_n.cells)[1], slope=coef(fit_n.cells)[2], color="grey30") +
  geom_point() +
  scale_x_log10() +
  theme_classic() +
  theme(aspect.ratio=1) + 
  scale_size(name="# Subjects", breaks=c(50, 100, 250, 500)) +
  xlab("Mean # nuclei per Subject") +
  ylab("# differentially expressed genes") +
  geom_text_repel(size=3, box.padding =.5, min.segment.length=1, max.overlaps = 15) + guides(color = "none") +
  annotate("text", x=1000, y=.7, label=bquote(p==.(pv)))
fig.nDE_ncells

Gene set analysis

# Load Gene Ontology database 
go.gs = get_GeneOntology(to="SYMBOL")

# remove set with > 1000 genes
go.gs = go.gs[sapply(go.gs, function(x) length(geneIds(x))) < 1000]

# zenith analysis 
res.gsa = zenith_gsa(fit, go.gs, coefs="ADdiag2typesAD")
# remove "GO0022625: " from gene set name
res.gsa$Geneset = gsub("^GO\\S+ ", "", res.gsa$Geneset)
res.gsa$assay = factor(res.gsa$assay, ctorder)  

# plotZenithResults(res.gsa, 10, 3) + theme(legend.position="bottom") 
# get genesets with FDR < 5%
gs = unique(res.gsa$Geneset[res.gsa$FDR < 0.05])

# keep only results of these genesets
df = res.gsa[res.gsa$Geneset %in% gs,]

# plot results, but with no limit based on the highest/lowest t-statistic
plotZenithResults(df, Inf, Inf) 

All phenotypes

# Scaled is reversed
colData(res.proc)$CERAD = 5 - colData(res.proc)$ceradsc

traits = c("ADdiag2types", "CERAD", 'ADdiag3types', 'braaksc', 'cogdx', "ordered(CERAD)", 'ordered(ADdiag3types)', 'ordered(braaksc)', 'ordered(cogdx)')

form.base = ~ age + sex + Study + pmi 

fitList = lapply( traits, function(trait){
  form = as.formula(paste(as.character(form.base), "+", trait))

  fit = dreamlet(res.proc, form) 
})
names(fitList) = traits
saveRDS(fitList, file="Mathys_Cell_2023_Major_Cell_Type_dreamlet_fit.RDS")
df_all = lapply(names(fitList), function(trait){

  fit = fitList[[trait]]
  coefSet = coefNames(fit)[-seq(5)]

  tab = topTable(fit, coef=coefSet, number=Inf)
  tab$trait = gsub("ordered\\((\\S+)\\)", "\\1", trait)
  tab$coefSet = paste(coefSet, collapse=' + ')
  tab = as_tibble(tab) 

if( length(coefSet) == 1){    
    cols = c("assay", "ID", "logFC", "AveExpr", "t", "P.Value", "adj.P.Val", "trait", "coefSet")
    tab = tab %>% 
      select(all_of(cols)) %>%
      rename(stat = "t")
  }else{
    cols = c("assay", "ID", "logFC", "AveExpr", "F", "P.Value", "adj.P.Val","trait", "coefSet")
    tab = tab %>% 
      mutate(logFC = NA) %>%
      select(all_of(cols)) %>%
      rename(stat = "F")
  }

}) %>% 
  bind_rows %>%
  arrange(trait, assay, ID, adj.P.Val)

write_parquet(df_all, "Mathys_Cell_2023_Major_Cell_Type_dreamlet.parquet")

Session Info

sessionInfo()
## R version 4.3.3 (2024-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /hpc/packages/minerva-centos7/oneAPI/p_2024.1.0.560/toolkits/mkl/2024.1/lib/libmkl_gf_lp64.so.2;  LAPACK version 3.11.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/New_York
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices datasets  utils     methods  
## [8] base     
## 
## other attached packages:
##  [1] formula.tools_1.7.1         arrow_15.0.1               
##  [3] RColorBrewer_1.1-3          lubridate_1.9.3            
##  [5] forcats_1.0.0               stringr_1.5.1              
##  [7] dplyr_1.1.4                 purrr_1.0.2                
##  [9] readr_2.1.5                 tidyr_1.3.1                
## [11] tibble_3.2.1                tidyverse_2.0.0            
## [13] qvalue_2.34.0               viridis_0.6.5              
## [15] viridisLite_0.4.2           cowplot_1.1.3              
## [17] scattermore_1.2             kableExtra_1.4.0           
## [19] knitr_1.45                  ggrepel_0.9.5              
## [21] zenith_1.4.2                GSEABase_1.64.0            
## [23] graph_1.80.0                annotate_1.80.0            
## [25] XML_3.99-0.16.1             AnnotationDbi_1.64.1       
## [27] scater_1.30.1               scuttle_1.12.0             
## [29] dreamlet_1.1.23             variancePartition_1.33.15  
## [31] BiocParallel_1.36.0         limma_3.58.1               
## [33] ggplot2_3.5.1               DelayedArray_0.28.0        
## [35] SparseArray_1.2.4           S4Arrays_1.2.1             
## [37] abind_1.4-5                 Matrix_1.6-5               
## [39] zellkonverter_1.13.3        SingleCellExperiment_1.24.0
## [41] SummarizedExperiment_1.32.0 Biobase_2.62.0             
## [43] GenomicRanges_1.54.1        GenomeInfoDb_1.38.8        
## [45] IRanges_2.36.0              S4Vectors_0.40.2           
## [47] BiocGenerics_0.48.1         MatrixGenerics_1.14.0      
## [49] matrixStats_1.3.0          
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.3.3             bitops_1.0-7             
##   [3] filelock_1.0.3            basilisk.utils_1.14.1    
##   [5] lifecycle_1.0.4           Rdpack_2.6               
##   [7] mixsqp_0.3-54             edgeR_4.0.16             
##   [9] lattice_0.22-5            MASS_7.3-60.0.1          
##  [11] backports_1.4.1           magrittr_2.0.3           
##  [13] metafor_4.6-0             sass_0.4.9               
##  [15] rmarkdown_2.26            jquerylib_0.1.4          
##  [17] yaml_2.3.8                reticulate_1.36.1        
##  [19] DBI_1.2.2                 minqa_1.2.6              
##  [21] zlibbioc_1.48.2           EnvStats_2.8.1           
##  [23] rmeta_3.0                 msigdbr_7.5.1            
##  [25] RCurl_1.98-1.14           GenomeInfoDbData_1.2.11  
##  [27] pbkrtest_0.5.2            irlba_2.3.5.1            
##  [29] svglite_2.1.3             DelayedMatrixStats_1.24.0
##  [31] codetools_0.2-19          xml2_1.3.6               
##  [33] tidyselect_1.2.1          farver_2.1.2             
##  [35] ScaledMatrix_1.10.0       lme4_1.1-35.2            
##  [37] mathjaxr_1.6-0            jsonlite_1.8.8           
##  [39] BiocNeighbors_1.20.2      iterators_1.0.14         
##  [41] systemfonts_1.1.0         tools_4.3.3              
##  [43] progress_1.2.3            Rcpp_1.0.12              
##  [45] glue_1.7.0                gridExtra_2.3            
##  [47] xfun_0.43                 withr_3.0.0              
##  [49] numDeriv_2016.8-1.1       fastmap_1.2.0            
##  [51] basilisk_1.14.3           boot_1.3-29              
##  [53] fansi_1.0.6               rsvd_1.0.5               
##  [55] caTools_1.18.2            digest_0.6.35            
##  [57] truncnorm_1.0-9           timechange_0.3.0         
##  [59] R6_2.5.1                  colorspace_2.1-0         
##  [61] gtools_3.9.5              RSQLite_2.3.6            
##  [63] RhpcBLASctl_0.23-42       utf8_1.2.4               
##  [65] generics_0.1.3            data.table_1.15.4        
##  [67] corpcor_1.6.10            prettyunits_1.2.0        
##  [69] httr_1.4.7                pkgconfig_2.0.3          
##  [71] gtable_0.3.5              blob_1.2.4               
##  [73] XVector_0.42.0            remaCor_0.0.18           
##  [75] htmltools_0.5.8.1         scales_1.3.0             
##  [77] png_0.1-8                 fANCOVA_0.6-1            
##  [79] ashr_2.2-63               rstudioapi_0.16.0        
##  [81] tzdb_0.4.0                reshape2_1.4.4           
##  [83] nlme_3.1-164              nloptr_2.0.3             
##  [85] cachem_1.1.0              operator.tools_1.6.3     
##  [87] KernSmooth_2.23-22        parallel_4.3.3           
##  [89] vipor_0.4.7               metadat_1.2-0            
##  [91] RcppZiggurat_0.1.6        pillar_1.9.0             
##  [93] grid_4.3.3                vctrs_0.6.5              
##  [95] gplots_3.1.3.1            BiocSingular_1.18.0      
##  [97] beachmat_2.18.1           mashr_0.2.79             
##  [99] xtable_1.8-4              beeswarm_0.4.0           
## [101] Rgraphviz_2.46.0          evaluate_0.23            
## [103] KEGGgraph_1.62.0          invgamma_1.1             
## [105] mvtnorm_1.2-4             cli_3.6.2                
## [107] locfit_1.5-9.9            compiler_4.3.3           
## [109] rlang_1.1.3               crayon_1.5.2             
## [111] SQUAREM_2021.1            labeling_0.4.3           
## [113] plyr_1.8.9                ggbeeswarm_0.7.2         
## [115] stringi_1.8.4             assertthat_0.2.1         
## [117] babelgene_22.9            lmerTest_3.1-3           
## [119] munsell_0.5.1             Biostrings_2.70.3        
## [121] aod_1.3.3                 dir.expiry_1.10.0        
## [123] hms_1.1.3                 sparseMatrixStats_1.14.0 
## [125] bit64_4.0.5               KEGGREST_1.42.0          
## [127] statmod_1.5.0             highr_0.10               
## [129] rbibutils_2.2.16          Rfast_2.1.0              
## [131] broom_1.0.6               memoise_2.0.1            
## [133] RcppParallel_5.1.7        bslib_0.7.0              
## [135] bit_4.0.5                 EnrichmentBrowser_2.32.0