UMAP provided with data

fig1 = plotProjection(sce, "X_umap", "major_subset")
fig2 = plotProjection(sce, "X_umap", "minor_subset")
plot_grid(fig1, fig2)

# read metadata
file = "/sc/arion/projects/CommonMind/hoffman/scRNAseq_data/covid19_combat/CBD-KEY-CLINVAR/COMBAT_CLINVAR_for_processed.txt"
df = read.table(file, header=TRUE)

# filter and merge metadata
df = df[df$scRNASeq_sample_ID %in% sce$scRNASeq_sample_ID,]
idx = match(sce$scRNASeq_sample_ID, df$scRNASeq_sample_ID)
colData(sce) = cbind(colData(sce), df[idx,])

# For each donor, select the most severe sample
res = colData(sce) %>%
    as_tibble %>%
    group_by(donor_id) %>%
    select(scRNASeq_sample_ID, Source) %>%
    distinct %>%
    summarize(scRNASeq_sample_ID, Source, i = which.max(Source),
        use_sample_id = scRNASeq_sample_ID[which.max(Source)])

# res[res$donor_id == "S00109",]

# subset to one sample per donor
sceSub = sce[,sce$scRNASeq_sample_ID %in% droplevels(res$use_sample_id)]

# create pseudobulk
pb <- aggregateToPseudoBulk(sceSub,
    assay = "counts",     
    cluster_id = "minor_subset",  
    sample_id = "scRNASeq_sample_ID",
    verbose = FALSE)

# crumblr transform
cobj = crumblr(cellCounts(pb))

# model repeated measures per donor
form = ~ Age + (1|sex) + (1|Source)
res.vp = fitExtractVarPartModel(cobj, form, colData(pb) )

cols = c(brewer.pal(ncol(res.vp)-1, "Set1"), "grey85")
fig.vp = plotPercentBars(sortCols(res.vp), col=cols)

form = ~ Age + sex + Source
fit = dream(cobj, form, colData(pb))
fit = eBayes(fit)

hc = buildClusterTreeFromPB(pb )

run_test = function(coef){
    res = treeTest( fit, cobj, hc, coef=coef)

    fig1 = plotTreeTestBeta(res) + theme(legend.position="none") + ggtitle(coef)

    fig2 = plotForest(res)

    # tab = topTable(fit, coef=coef, number=Inf)
    # tab$celltype = factor(rownames(tab), rev(get_taxa_name(fig1)))
    # tab$se = with(tab, logFC/t)

    # fig2 = ggplot(tab, aes(celltype, logFC)) + 
    #   geom_hline(yintercept=0, linetype="dashed", color="grey", linewidth=1) +
    #   geom_errorbar(aes(ymin = logFC - 1.96*se, ymax = logFC + 1.96*se), width=0) +
    #   geom_point(color="dodgerblue") +
    #   theme_classic() +
    #   coord_flip() + 
    #   xlab('') + 
    #   ylab("Effect size") +
    #   theme(axis.text.y=element_blank(),
    #         axis.ticks.y=element_blank())
 
    # combine plots
    fig2 %>% insert_left(fig1) %>% insert_right(fig.vp)
}

# compare to baseline of controls
coef_array = c("SourceCOVID_HCW_MILD", "SourceCOVID_MILD", "SourceCOVID_SEV", "SourceCOVID_CRIT", "SourceSepsis", "SourceFlu")

figList = lapply(coef_array, run_test)
names(figList) = coef_array

figList

## $SourceCOVID_HCW_MILD

## 
## $SourceCOVID_MILD

## 
## $SourceCOVID_SEV

## 
## $SourceCOVID_CRIT

## 
## $SourceSepsis

## 
## $SourceFlu

# get results from each coefficient
tab = lapply(coef_array, function(coef){
    
    tab = topTable(fit, coef=coef, number=Inf)
    tab$coef = coef
    tab$celltype = rownames(tab)
    tab$se = with(tab, logFC/t)

    tab
})
tab = do.call(rbind, tab)
rownames(tab) = c()
tab$adj.P.Val = p.adjust(tab$P.Value, "fdr")

# get order of cell types
res = treeTest( fit, cobj, hc, coef=coef_array[1])
fig1 = plotTreeTest(res) + theme(legend.position="none") + ggtitle(coef)
lvls = rev(get_taxa_name(fig1))
tab$celltype = factor(tab$celltype, lvls)

# Heatmap of all results
tab$coef = factor(gsub("^Source", "", tab$coef), gsub("^Source", "", coef_array))
lim = max(abs(tab$logFC))
ratio = with(tab, length(unique(celltype)) / length(unique(coef))
    )
ggplot(tab, aes(coef, celltype, fill=logFC, label=ifelse(adj.P.Val < 0.05, "*", ''))) +
    geom_tile() +
    geom_text(vjust=1, hjust=0.5) +
    theme_classic() + 
    theme(aspect.ratio=ratio, axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
    scale_fill_gradient2(low="blue", mid="white", high="red", limits=c(-lim, lim))

Plot of one cell type

# get CLR values from crumblr, but don't regularize weights
cobj = crumblr(cellCounts(pb), max.ratio=Inf)

library(ggbeeswarm)
library(ggrepel)

# CT = "PB"
# CT = "MAIT"
CT = "cMono.cyc"
# CT = "RET"
# CT = "CD4.TEFF.prolif"
df = data.frame(CLR = cobj$E[CT,],
                Source = pb$Source,
                se = 1/sqrt(cobj$weights[CT,]),
                counts = cellCounts(pb)[,CT],
                totalCells = rowSums(cellCounts(pb)),
                colData(pb))

# points to highlight
df_count1 = df %>%
    filter(counts > 0) %>%
    arrange(-se) %>% 
    head(2) %>%
    select(CLR, Source, se, counts, totalCells)

df_count2 = df %>%
    arrange(se) %>% 
    head(2) %>%
    select(CLR, Source, se, counts, totalCells)

df_count = rbind(df_count1, df_count2)


n.skip = sum(scale(df$se) > 3)
zmax = sort(df$se, decreasing=TRUE)[n.skip+1]

df %>%
    filter(Source != "COVID_LDN") %>%
    ggplot(aes(Source, CLR, color=pmin(zmax, se))) +
        geom_boxplot(width=.3) +
        geom_beeswarm(cex = 2, size=3) + 
        theme_classic() +
        theme(plot.title = element_text(hjust = 0.5), aspect.ratio=1, axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
        ggtitle(CT) +
        scale_color_gradient2(low="grey", high="red", limits= c(0,zmax*1.01), name = "Standard error") +
        geom_text_repel(data=df_count, aes(Source, CLR, label=paste(counts, totalCells, sep=' / ')), color="black", box.padding=2)

COVID COMBAT study

COvid-19 Multi-omics Blood ATlas (COMBAT) Consortium

Developed by Gabriel Hoffman

Run on 2025-01-07 10:19:28.826655

UMAP provided with data

Plot of one cell type