Load packages
Load data
UMAP embedding
- Properties of dataset
- Process pseudobulk data to estimate precision weights
  - Show details of processing
  - Show voom-style mean-variance trends
Variance partitioning analysis
dreamlet
Volcano
- Highlight genes
  - Summarize differential expression
- Gene set analysis
All phenotypes
Session Info

Load packages

library(SingleCellExperiment)
library(zellkonverter)
library(DelayedArray)
library(dreamlet)
library(scater)
library(GSEABase)
library(zenith)
library(ggrepel)
library(knitr)
library(kableExtra)
library(scattermore)
library(cowplot)
library(ggplot2)
library(viridis)
library(qvalue)
library(tidyverse)
library(RColorBrewer)
library(BiocParallel)
library(arrow)
library(formula.tools)

Load data

# read H5AD file
file = "/sc/arion/projects/CommonMind/leed62/ref/published/2023_Mathys_Cell_snRNA/cell_browser/ad-aging-brain/240321_2023_Mathys_Cell_snRNA_cb_final_only_counts.h5ad"
sce = readH5AD(file, use_hdf5=TRUE, verbose=TRUE)#, obsm=FALSE, uns=FALSE, layers=FALSE, var=FALSE, obs=FALSE)   
counts(sce) = assay(sce, "X")

sce$Cell_Type = factor(sce$Cell_Type, sort(levels(sce$Cell_Type)))

# create pseudobulk 
pb <- aggregateToPseudoBulk(sce,
    assay = "counts",     
    cluster_id = "Major_Cell_Type",  
    sample_id = "individualID")

UMAP embedding

plotProjection(sce, "X_umap", annotation='Major_Cell_Type')

Properties of dataset

df = data.frame(table(pb$ADdiag2types))
colnames(df) = c("Disease status", "Count")

df %>%
  kbl(row.names=FALSE) %>% 
  kable_classic(full_width = FALSE)

Disease status	Count
AD	205
NA	3
nonAD	219

df = data.frame(table(pb$ADdiag3types))
colnames(df) = c("Disease status", "Count")

df %>%
  kbl(row.names=FALSE) %>% 
  kable_classic(full_width = FALSE)

Disease status	Count
NA	3
earlyAD	132
lateAD	73
nonAD	219

df = list()
df$`# Samples` = ncol(pb)
df$`# Subjects` = nlevels(sce$Individual)
df$`# Cells` = ncol(sce)
df = unlist(df)
df = data.frame(Propert=names(df), count = df)

df %>%  
  kbl(row.names=FALSE) %>% 
  kable_classic(full_width = FALSE)

Propert	count
# Samples	427
# Subjects	427
# Cells	2327742

Process pseudobulk data to estimate precision weights

types = c("nonAD", "AD" )
pb$ADdiag2types = factor(pb$ADdiag2types, types)

types = c("nonAD", "earlyAD", "lateAD" )
pb$ADdiag3types = factor(pb$ADdiag3types, types)

# Normalize and apply voomWithDreamWeights
form = ~ age + sex + Study + pmi
res.proc = processAssays( pb, 
                          form,  
                          min.cells = 2,
                          min.count = 1,
                          min.samples = 4,
                          min.prop = 0.3,
                          BPPARAM = SnowParam(6))

Show details of processing

details(res.proc)

##   assay n_retain                  formula formDropsTerms n_genes n_errors
## 1   Ast      422 ~age + sex + Study + pmi          FALSE   13835        0
## 2   Exc      422 ~age + sex + Study + pmi          FALSE   16669        0
## 3   Inh      415 ~age + sex + Study + pmi          FALSE   15492        0
## 4   Mic      419 ~age + sex + Study + pmi          FALSE   12160        0
## 5   Oli      422 ~age + sex + Study + pmi          FALSE   14692        0
## 6   Opc      422 ~age + sex + Study + pmi          FALSE   13432        0
## 7   Vas      397 ~age + sex + Study + pmi          FALSE    9714        0
##   error_initial
## 1         FALSE
## 2         FALSE
## 3         FALSE
## 4         FALSE
## 5         FALSE
## 6         FALSE
## 7         FALSE

Show voom-style mean-variance trends

# show voom plot for each cell clusters  
plotVoom( res.proc, ncol=4)

Variance partitioning analysis

# run variance partitioning analysis
form = ~ age + sex + Study + pmi + ADdiag2types 
vp.lst = fitVarPart( res.proc, form, BPPARAM = SnowParam(6, progressbar=TRUE))

# Summarize variance fractions genome-wide for each cell type
plotVarPart(sortCols(vp.lst), label.angle=60, ncol=4)

dreamlet

form = ~ age + sex + Study + pmi + ADdiag2types 
fit = dreamlet(res.proc, form)

Volcano

plotVolcano(fit, "ADdiag2typesAD")

topTable(fit, coef = "ADdiag2typesAD", number=Inf) %>%
  write.table(file="Mathys_2023_Major_Cell_Type_ADdiag2typesAD.tsv", quote=FALSE, sep="\t", row.names=FALSE)

Highlight genes

ctorder = assayNames(fit)

genes = c("DUSP10", "PTPRG", 'NPNT', 'DPYD', "VGF", 'SPRY4-AS1', "PDE10A", "NCAM2", 'PCDH7') 
fig.gene_heatmap = plotGeneHeatmap( fit, coef="ADdiag2typesAD", genes=genes, assays=ctorder)  
fig.gene_heatmap

figList = lapply(genes, function(g){  
  plotForest( fit, coef = 'ADdiag2typesAD', gene = g, assays=ctorder) +
    theme(aspect.ratio=1, legend.position="none") 
}) 
names(figList) = genes
plot_grid(plotlist=figList, ncol=3)

gene_CT = c("PTPRG" = "Micro_PVM", 'NPNT' = "Astro", 'DPYD'= "Micro_PVM", 'SPRY4-AS1' = "EN_L3_5_IT_3", "PDE10A" = "EN_L6_IT", "NCAM2" = "EN_L3_5_IT_2", 'PCDH7' = "IN_LAMP5")

figList = lapply( names(gene_CT), function(g){

  df = extractData(res.proc, gene_CT[g])

  df = df[,c('Dx', g)]
  colnames(df)[colnames(df)==g] = "expr"

  ggplot(df, aes(Dx, expr, fill=Dx)) +
          geom_boxplot() +
          theme_classic() +
          theme(aspect.ratio=1, plot.title = element_text(hjust = 0.5), legend.position="none") +
          ggtitle(paste0(gene_CT[g], ": ", g)) +
          ylab(bquote(log[2]~CPM)) +
          scale_fill_manual(values=c("grey50", "red3"))
})
names(figList) = names(gene_CT)

plot_grid(plotlist=figList, ncol=3)

Summarize differential expression

# Summarize differential expression for each coef and assay
df_de = fit %>%
  topTable(coef='ADdiag2typesAD', number=Inf) %>%
    as_tibble %>% 
    group_by(assay) %>% 
    summarise( 
      nGenes = length(adj.P.Val), 
      nDE = sum(adj.P.Val < 0.05),
      pi1 = 1 - qvalue(P.Value)$pi0) %>%
  mutate(assay = factor(assay, ctorder))  

ymax = 1.05*max(df_de$nGenes)
fig1 = ggplot(df_de, aes(nGenes, assay, fill=assay)) + 
    geom_bar(stat="identity") + 
    theme_classic() +
    theme(aspect.ratio=1, legend.position="none") +
    scale_x_continuous(limits=c(0,ymax), expand=c(0,0)) +
    xlab("# genes expressed") +
    ylab("Cell type") 

ymax = max(1.05*max(df_de$nDE), 100)
fig2 = ggplot(df_de, aes(nDE, assay, fill=assay)) + 
    geom_bar(stat="identity") + 
    theme_classic() +
    theme(aspect.ratio=1, legend.position="none", axis.text.y=element_blank()) +
    scale_x_continuous(limits=c(0,ymax), expand=c(0,0)) +
    xlab("# genes with FDR < 5%") +
    ylab('')

fig3 = ggplot(df_de, aes(pi1, assay, fill=assay)) + 
    geom_bar(stat="identity") + 
    theme_classic() +
    theme(aspect.ratio=1, legend.position="none", axis.text.y=element_blank()) +
    scale_x_continuous(limits=c(0,1), expand=c(0,0)) +
    xlab(bquote(pi[1]))+
    ylab('')

plot_grid(fig1, fig2, fig3, labels=LETTERS[1:3], nrow=1, axis="tblr", align="hv")

fig_de_count = ggplot(df_de, aes(2, assay)) +
        geom_tile(fill="white") +
        geom_point(aes(color=nDE/nGenes*100, size=nGenes)) +
        scale_color_viridis(name = "% genes\npassing FDR", lim=c(0, NA)) +
        scale_size_continuous(name = "# genes", breaks=c(1, 4, 8, 12, 16)*1000) +
        theme_classic() +
        theme(aspect.ratio=as.numeric(nrow(df_de)), axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
        xlab('') + ylab('') 
fig_de_count

# Number of cells observed per Subject
df = colData(sce) %>%
  xtabs( ~ Individual + Major_Cell_Type,.) %>%
  as_tibble %>%
  pivot_longer(cols=Individual) %>%
  mutate(assay = Major_Cell_Type) %>%
  group_by(assay) %>%
  summarize(sum.cells = sum(n))

df2 = df_vp %>%
      as_tibble %>%
      mutate(Donor = Dx + sex + Individual + Age) %>%
      group_by(assay) %>%
      summarize(mean.vp = mean(Donor))

df3 = inner_join(df, df2, by='assay') %>%
        inner_join(details(res.proc) %>% as_tibble) %>%
        mutate(assay = factor(assay, ctorder)) 

fit_n.cells = lm(mean.vp ~ log10(sum.cells/n_retain), df3)#, weights=df3$n_retain)
# summary(fit_n.cells)
pv = coef(summary(fit_n.cells))[2,4]
pv = format(pv, digits=2)

fig.vp_ncells = ggplot(df3, aes(sum.cells/n_retain, mean.vp, color=assay, size=n_retain, label=assay)) +
  geom_abline(intercept=coef(fit_n.cells)[1], slope=coef(fit_n.cells)[2], color="grey30") +
  geom_point() +
  scale_x_log10() +
  theme_classic() +
  theme(aspect.ratio=1) + 
  scale_size(name="# Subjects", breaks=c(50, 100, 250, 500)) +
  xlab("Mean # cells per Subject") +
  ylab("Mean % variance across Subject") +
  geom_text_repel(size=3, box.padding =.5, min.segment.length=1, max.overlaps = 15) + guides(color = "none") +
  annotate("text", x=40, y=.7, label=bquote(p==.(pv)))
fig.vp_ncells

df3 = inner_join(df, df_de, by='assay') %>%
        inner_join(details(res.proc) %>% as_tibble) %>%
        mutate(assay = factor(assay, ctorder)) 

fit_n.cells = lm(nDE ~ log10(sum.cells/n_retain), df3)
pv = coef(summary(fit_n.cells))[2,4]
pv = format(pv, digits=2)

fig.nDE_ncells = ggplot(df3, aes(sum.cells/n_retain, nDE, color=assay, size=n_retain, label=assay)) +
  geom_abline(intercept=coef(fit_n.cells)[1], slope=coef(fit_n.cells)[2], color="grey30") +
  geom_point() +
  scale_x_log10() +
  theme_classic() +
  theme(aspect.ratio=1) + 
  scale_size(name="# Subjects", breaks=c(50, 100, 250, 500)) +
  xlab("Mean # nuclei per Subject") +
  ylab("# differentially expressed genes") +
  geom_text_repel(size=3, box.padding =.5, min.segment.length=1, max.overlaps = 15) + guides(color = "none") +
  annotate("text", x=1000, y=.7, label=bquote(p==.(pv)))
fig.nDE_ncells

Gene set analysis

# Load Gene Ontology database 
go.gs = get_GeneOntology(to="SYMBOL")

# remove set with > 1000 genes
go.gs = go.gs[sapply(go.gs, function(x) length(geneIds(x))) < 1000]

# zenith analysis 
res.gsa = zenith_gsa(fit, go.gs, coefs="ADdiag2typesAD")

# remove "GO0022625: " from gene set name
res.gsa$Geneset = gsub("^GO\\S+ ", "", res.gsa$Geneset)
res.gsa$assay = factor(res.gsa$assay, ctorder)  

# plotZenithResults(res.gsa, 10, 3) + theme(legend.position="bottom")

# get genesets with FDR < 5%
gs = unique(res.gsa$Geneset[res.gsa$FDR < 0.05])

# keep only results of these genesets
df = res.gsa[res.gsa$Geneset %in% gs,]

# plot results, but with no limit based on the highest/lowest t-statistic
plotZenithResults(df, Inf, Inf)

All phenotypes

# Scaled is reversed
colData(res.proc)$CERAD = 5 - colData(res.proc)$ceradsc

traits = c("ADdiag2types", "CERAD", 'ADdiag3types', 'braaksc', 'cogdx', "ordered(CERAD)", 'ordered(ADdiag3types)', 'ordered(braaksc)', 'ordered(cogdx)')

form.base = ~ age + sex + Study + pmi 

fitList = lapply( traits, function(trait){
  form = as.formula(paste(as.character(form.base), "+", trait))

  fit = dreamlet(res.proc, form) 
})
names(fitList) = traits

saveRDS(fitList, file="Mathys_Cell_2023_Major_Cell_Type_dreamlet_fit.RDS")
df_all = lapply(names(fitList), function(trait){

  fit = fitList[[trait]]
  coefSet = coefNames(fit)[-seq(5)]

  tab = topTable(fit, coef=coefSet, number=Inf)
  tab$trait = gsub("ordered\\((\\S+)\\)", "\\1", trait)
  tab$coefSet = paste(coefSet, collapse=' + ')
  tab = as_tibble(tab) 

if( length(coefSet) == 1){    
    cols = c("assay", "ID", "logFC", "AveExpr", "t", "P.Value", "adj.P.Val", "trait", "coefSet")
    tab = tab %>% 
      select(all_of(cols)) %>%
      rename(stat = "t")
  }else{
    cols = c("assay", "ID", "logFC", "AveExpr", "F", "P.Value", "adj.P.Val","trait", "coefSet")
    tab = tab %>% 
      mutate(logFC = NA) %>%
      select(all_of(cols)) %>%
      rename(stat = "F")
  }

}) %>% 
  bind_rows %>%
  arrange(trait, assay, ID, adj.P.Val)

write_parquet(df_all, "Mathys_Cell_2023_Major_Cell_Type_dreamlet.parquet")

Session Info

sessionInfo()

## R version 4.3.3 (2024-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /hpc/packages/minerva-centos7/oneAPI/p_2024.1.0.560/toolkits/mkl/2024.1/lib/libmkl_gf_lp64.so.2;  LAPACK version 3.11.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/New_York
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices datasets  utils     methods  
## [8] base     
## 
## other attached packages:
##  [1] formula.tools_1.7.1         arrow_15.0.1               
##  [3] RColorBrewer_1.1-3          lubridate_1.9.3            
##  [5] forcats_1.0.0               stringr_1.5.1              
##  [7] dplyr_1.1.4                 purrr_1.0.2                
##  [9] readr_2.1.5                 tidyr_1.3.1                
## [11] tibble_3.2.1                tidyverse_2.0.0            
## [13] qvalue_2.34.0               viridis_0.6.5              
## [15] viridisLite_0.4.2           cowplot_1.1.3              
## [17] scattermore_1.2             kableExtra_1.4.0           
## [19] knitr_1.45                  ggrepel_0.9.5              
## [21] zenith_1.4.2                GSEABase_1.64.0            
## [23] graph_1.80.0                annotate_1.80.0            
## [25] XML_3.99-0.16.1             AnnotationDbi_1.64.1       
## [27] scater_1.30.1               scuttle_1.12.0             
## [29] dreamlet_1.1.23             variancePartition_1.33.15  
## [31] BiocParallel_1.36.0         limma_3.58.1               
## [33] ggplot2_3.5.1               DelayedArray_0.28.0        
## [35] SparseArray_1.2.4           S4Arrays_1.2.1             
## [37] abind_1.4-5                 Matrix_1.6-5               
## [39] zellkonverter_1.13.3        SingleCellExperiment_1.24.0
## [41] SummarizedExperiment_1.32.0 Biobase_2.62.0             
## [43] GenomicRanges_1.54.1        GenomeInfoDb_1.38.8        
## [45] IRanges_2.36.0              S4Vectors_0.40.2           
## [47] BiocGenerics_0.48.1         MatrixGenerics_1.14.0      
## [49] matrixStats_1.3.0          
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.3.3             bitops_1.0-7             
##   [3] filelock_1.0.3            basilisk.utils_1.14.1    
##   [5] lifecycle_1.0.4           Rdpack_2.6               
##   [7] mixsqp_0.3-54             edgeR_4.0.16             
##   [9] lattice_0.22-5            MASS_7.3-60.0.1          
##  [11] backports_1.4.1           magrittr_2.0.3           
##  [13] metafor_4.6-0             sass_0.4.9               
##  [15] rmarkdown_2.26            jquerylib_0.1.4          
##  [17] yaml_2.3.8                reticulate_1.36.1        
##  [19] DBI_1.2.2                 minqa_1.2.6              
##  [21] zlibbioc_1.48.2           EnvStats_2.8.1           
##  [23] rmeta_3.0                 msigdbr_7.5.1            
##  [25] RCurl_1.98-1.14           GenomeInfoDbData_1.2.11  
##  [27] pbkrtest_0.5.2            irlba_2.3.5.1            
##  [29] svglite_2.1.3             DelayedMatrixStats_1.24.0
##  [31] codetools_0.2-19          xml2_1.3.6               
##  [33] tidyselect_1.2.1          farver_2.1.2             
##  [35] ScaledMatrix_1.10.0       lme4_1.1-35.2            
##  [37] mathjaxr_1.6-0            jsonlite_1.8.8           
##  [39] BiocNeighbors_1.20.2      iterators_1.0.14         
##  [41] systemfonts_1.1.0         tools_4.3.3              
##  [43] progress_1.2.3            Rcpp_1.0.12              
##  [45] glue_1.7.0                gridExtra_2.3            
##  [47] xfun_0.43                 withr_3.0.0              
##  [49] numDeriv_2016.8-1.1       fastmap_1.2.0            
##  [51] basilisk_1.14.3           boot_1.3-29              
##  [53] fansi_1.0.6               rsvd_1.0.5               
##  [55] caTools_1.18.2            digest_0.6.35            
##  [57] truncnorm_1.0-9           timechange_0.3.0         
##  [59] R6_2.5.1                  colorspace_2.1-0         
##  [61] gtools_3.9.5              RSQLite_2.3.6            
##  [63] RhpcBLASctl_0.23-42       utf8_1.2.4               
##  [65] generics_0.1.3            data.table_1.15.4        
##  [67] corpcor_1.6.10            prettyunits_1.2.0        
##  [69] httr_1.4.7                pkgconfig_2.0.3          
##  [71] gtable_0.3.5              blob_1.2.4               
##  [73] XVector_0.42.0            remaCor_0.0.18           
##  [75] htmltools_0.5.8.1         scales_1.3.0             
##  [77] png_0.1-8                 fANCOVA_0.6-1            
##  [79] ashr_2.2-63               rstudioapi_0.16.0        
##  [81] tzdb_0.4.0                reshape2_1.4.4           
##  [83] nlme_3.1-164              nloptr_2.0.3             
##  [85] cachem_1.1.0              operator.tools_1.6.3     
##  [87] KernSmooth_2.23-22        parallel_4.3.3           
##  [89] vipor_0.4.7               metadat_1.2-0            
##  [91] RcppZiggurat_0.1.6        pillar_1.9.0             
##  [93] grid_4.3.3                vctrs_0.6.5              
##  [95] gplots_3.1.3.1            BiocSingular_1.18.0      
##  [97] beachmat_2.18.1           mashr_0.2.79             
##  [99] xtable_1.8-4              beeswarm_0.4.0           
## [101] Rgraphviz_2.46.0          evaluate_0.23            
## [103] KEGGgraph_1.62.0          invgamma_1.1             
## [105] mvtnorm_1.2-4             cli_3.6.2                
## [107] locfit_1.5-9.9            compiler_4.3.3           
## [109] rlang_1.1.3               crayon_1.5.2             
## [111] SQUAREM_2021.1            labeling_0.4.3           
## [113] plyr_1.8.9                ggbeeswarm_0.7.2         
## [115] stringi_1.8.4             assertthat_0.2.1         
## [117] babelgene_22.9            lmerTest_3.1-3           
## [119] munsell_0.5.1             Biostrings_2.70.3        
## [121] aod_1.3.3                 dir.expiry_1.10.0        
## [123] hms_1.1.3                 sparseMatrixStats_1.14.0 
## [125] bit64_4.0.5               KEGGREST_1.42.0          
## [127] statmod_1.5.0             highr_0.10               
## [129] rbibutils_2.2.16          Rfast_2.1.0              
## [131] broom_1.0.6               memoise_2.0.1            
## [133] RcppParallel_5.1.7        bslib_0.7.0              
## [135] bit_4.0.5                 EnrichmentBrowser_2.32.0

Single-cell atlas reveals correlates of high cognitive function, dementia, and resilience to Alzheimer’s disease pathology

Mathys, et al. Cell (2023)

Developed by Gabriel Hoffman

Run on 2024-05-24 12:44:11.832083

Load packages

Load data

UMAP embedding

Properties of dataset

Process pseudobulk data to estimate precision weights

Show details of processing

Show voom-style mean-variance trends

Variance partitioning analysis

dreamlet

Volcano

Highlight genes

Summarize differential expression

Gene set analysis

All phenotypes

Session Info