library(SingleCellExperiment)
library(zellkonverter)
library(dreamlet)
library(crumblr)
library(aplot)
library(RColorBrewer)
library(ggtree)
library(tidyverse)
library(cowplot)
# read single cell RNA-seq
file = "/sc/arion/projects/CommonMind/hoffman/scRNAseq_data/covid19_combat/local.h5ad"
sce = readH5AD(file, use_hdf5=TRUE)
counts(sce) = assay(sce, "X")
# remove doublets
sce = sce[,sce$GEX_region != "E: Doublets"]
# remove nan cells
sce = sce[,sce$minor_subset != "nan"]
# remove samples with few cells
totalCells = with(colData(sce), table(donor_id))
keepIds = names(totalCells)[totalCells > 500]
sce = sce[,sce$donor_id %in% keepIds]
# factor for source
sce$Source = factor(sce$Source, c("HV", "COVID_HCW_MILD", "COVID_MILD", "COVID_SEV", "COVID_CRIT", "COVID_LDN", "Sepsis", "Flu"))
UMAP provided with data
fig1 = plotProjection(sce, "X_umap", "major_subset")
fig2 = plotProjection(sce, "X_umap", "minor_subset")
plot_grid(fig1, fig2)

# read metadata
file = "/sc/arion/projects/CommonMind/hoffman/scRNAseq_data/covid19_combat/CBD-KEY-CLINVAR/COMBAT_CLINVAR_for_processed.txt"
df = read.table(file, header=TRUE)
# filter and merge metadata
df = df[df$scRNASeq_sample_ID %in% sce$scRNASeq_sample_ID,]
idx = match(sce$scRNASeq_sample_ID, df$scRNASeq_sample_ID)
colData(sce) = cbind(colData(sce), df[idx,])
# For each donor, select the most severe sample
res = colData(sce) %>%
as_tibble %>%
group_by(donor_id) %>%
select(scRNASeq_sample_ID, Source) %>%
distinct %>%
summarize(scRNASeq_sample_ID, Source, i = which.max(Source),
use_sample_id = scRNASeq_sample_ID[which.max(Source)])
# res[res$donor_id == "S00109",]
# subset to one sample per donor
sceSub = sce[,sce$scRNASeq_sample_ID %in% droplevels(res$use_sample_id)]
# create pseudobulk
pb <- aggregateToPseudoBulk(sceSub,
assay = "counts",
cluster_id = "minor_subset",
sample_id = "scRNASeq_sample_ID",
verbose = FALSE)
# crumblr transform
cobj = crumblr(cellCounts(pb))
# model repeated measures per donor
form = ~ Age + (1|sex) + (1|Source)
res.vp = fitExtractVarPartModel(cobj, form, colData(pb) )
cols = c(brewer.pal(ncol(res.vp)-1, "Set1"), "grey85")
fig.vp = plotPercentBars(sortCols(res.vp), col=cols)
form = ~ Age + sex + Source
fit = dream(cobj, form, colData(pb))
fit = eBayes(fit)
hc = buildClusterTreeFromPB(pb )
run_test = function(coef){
res = treeTest( fit, cobj, hc, coef=coef)
fig1 = plotTreeTestBeta(res) + theme(legend.position="none") + ggtitle(coef)
fig2 = plotForest(res)
# tab = topTable(fit, coef=coef, number=Inf)
# tab$celltype = factor(rownames(tab), rev(get_taxa_name(fig1)))
# tab$se = with(tab, logFC/t)
# fig2 = ggplot(tab, aes(celltype, logFC)) +
# geom_hline(yintercept=0, linetype="dashed", color="grey", linewidth=1) +
# geom_errorbar(aes(ymin = logFC - 1.96*se, ymax = logFC + 1.96*se), width=0) +
# geom_point(color="dodgerblue") +
# theme_classic() +
# coord_flip() +
# xlab('') +
# ylab("Effect size") +
# theme(axis.text.y=element_blank(),
# axis.ticks.y=element_blank())
# combine plots
fig2 %>% insert_left(fig1) %>% insert_right(fig.vp)
}
# compare to baseline of controls
coef_array = c("SourceCOVID_HCW_MILD", "SourceCOVID_MILD", "SourceCOVID_SEV", "SourceCOVID_CRIT", "SourceSepsis", "SourceFlu")
figList = lapply(coef_array, run_test)
names(figList) = coef_array
figList
## $SourceCOVID_HCW_MILD

##
## $SourceCOVID_MILD

##
## $SourceCOVID_SEV

##
## $SourceCOVID_CRIT

##
## $SourceSepsis

##
## $SourceFlu

# get results from each coefficient
tab = lapply(coef_array, function(coef){
tab = topTable(fit, coef=coef, number=Inf)
tab$coef = coef
tab$celltype = rownames(tab)
tab$se = with(tab, logFC/t)
tab
})
tab = do.call(rbind, tab)
rownames(tab) = c()
tab$adj.P.Val = p.adjust(tab$P.Value, "fdr")
# get order of cell types
res = treeTest( fit, cobj, hc, coef=coef_array[1])
fig1 = plotTreeTest(res) + theme(legend.position="none") + ggtitle(coef)
lvls = rev(get_taxa_name(fig1))
tab$celltype = factor(tab$celltype, lvls)
# Heatmap of all results
tab$coef = factor(gsub("^Source", "", tab$coef), gsub("^Source", "", coef_array))
lim = max(abs(tab$logFC))
ratio = with(tab, length(unique(celltype)) / length(unique(coef))
)
ggplot(tab, aes(coef, celltype, fill=logFC, label=ifelse(adj.P.Val < 0.05, "*", ''))) +
geom_tile() +
geom_text(vjust=1, hjust=0.5) +
theme_classic() +
theme(aspect.ratio=ratio, axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
scale_fill_gradient2(low="blue", mid="white", high="red", limits=c(-lim, lim))

Plot of one cell type
# get CLR values from crumblr, but don't regularize weights
cobj = crumblr(cellCounts(pb), max.ratio=Inf)
library(ggbeeswarm)
library(ggrepel)
# CT = "PB"
# CT = "MAIT"
CT = "cMono.cyc"
# CT = "RET"
# CT = "CD4.TEFF.prolif"
df = data.frame(CLR = cobj$E[CT,],
Source = pb$Source,
se = 1/sqrt(cobj$weights[CT,]),
counts = cellCounts(pb)[,CT],
totalCells = rowSums(cellCounts(pb)),
colData(pb))
# points to highlight
df_count1 = df %>%
filter(counts > 0) %>%
arrange(-se) %>%
head(2) %>%
select(CLR, Source, se, counts, totalCells)
df_count2 = df %>%
arrange(se) %>%
head(2) %>%
select(CLR, Source, se, counts, totalCells)
df_count = rbind(df_count1, df_count2)
n.skip = sum(scale(df$se) > 3)
zmax = sort(df$se, decreasing=TRUE)[n.skip+1]
df %>%
filter(Source != "COVID_LDN") %>%
ggplot(aes(Source, CLR, color=pmin(zmax, se))) +
geom_boxplot(width=.3) +
geom_beeswarm(cex = 2, size=3) +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5), aspect.ratio=1, axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
ggtitle(CT) +
scale_color_gradient2(low="grey", high="red", limits= c(0,zmax*1.01), name = "Standard error") +
geom_text_repel(data=df_count, aes(Source, CLR, label=paste(counts, totalCells, sep=' / ')), color="black", box.padding=2)
