Data is available from Synapse
suppressPackageStartupMessages(library(decorate))
suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(edgeR))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(GenomicRanges))
options(xtable.type="html")
knitr::opts_chunk$set(
echo=TRUE,
warning=FALSE,
message=TRUE,
error = FALSE,
tidy = FALSE,
cache = TRUE,
cache.lazy = FALSE,
dev = c("png", "pdf"),
fig.width=7, fig.height=7)
options(markdown.HTML.stylesheet = 'css/custom.css')
suppressPackageStartupMessages(library(doParallel))
suppressPackageStartupMessages(library(BiocParallel))
suppressPackageStartupMessages(library(synapser))
# login once and then save info
# synLogin("user.name", "password", rememberMe=TRUE)
synLogin()
## Welcome, Gabriel Hoffman!
## NULL
# chip-Seq counts
chipCounts = read.table( synGet('syn17061266')$path, header=TRUE, stringsAsFactors=FALSE, sep='\t', row.names=1)
# # peak locations
peakLocs_in = fread( synGet('syn17061264')$path )
peakLocs = with(peakLocs_in, GRanges(V1, IRanges(V2, V3, names=V4)))
meta_clinical = fread( synGet('syn17943350')$path )
meta_atac = fread( synGet('syn20780661')$path )
metaData = merge( meta_atac, meta_clinical, by.x='Individual_ID', by.y='Individual ID')
idx = match( colnames(chipCounts), metaData$Assay_Sample_ID)
metaData = metaData[idx,]
# match( colnames(chipCounts), metaData$Assay_Sample_ID)
# filter by Dx
keep = metaData$Dx %in% c('SCZ', 'Control')
metaData = metaData[keep,]
chipCounts = chipCounts[,keep]
metaData$Dx = factor(metaData$Dx, c("Control", "SCZ"))
isexpr = rowSums(cpm(chipCounts)>1) >= 0.2*ncol(chipCounts)
peakLocs2 = peakLocs[which(isexpr)]
# Standard usage of limma/voom
countObj = DGEList( chipCounts[isexpr,] )
countObj = calcNormFactors( countObj )
design = model.matrix( ~ as.character(`ATACSeq_report:Sequencing_Order_ID`) +
`ATACSeq_report:Mean_GC_content`+
`ATACSeq_report:Mapped_Reads` +
`Age of Death` +
`PMI (in hours)` + Sex , metaData)
vobj = voom( countObj, design, plot=TRUE)
# identical( names(peakLocs2), rownames(vobj))
dcmp = svd(vobj$E, nv=5, nu=0)
frac = dcmp$d^2 / sum(dcmp$d^2) * 100
xlab = paste0('PC1: ', round(frac[1], 1), '%')
ylab = paste0('PC2: ', round(frac[2], 1), '%')
plot(dcmp$v[,1], dcmp$v[,2], xlab=xlab, ylab=ylab, main="Original data")
dsgn = model.matrix( ~ dcmp$v[,1:2] + as.character(`ATACSeq_report:Sequencing_Order_ID`) +
`ATACSeq_report:Mean_GC_content`+
`ATACSeq_report:Mapped_Reads` +
`Age of Death` +
`PMI (in hours)` + Sex , metaData)
fitPC = lmFit(vobj, dsgn)
quantResid = residuals( fitPC, vobj )
dcmp = svd(quantResid, nv=5, nu=0)
frac = dcmp$d^2 / sum(dcmp$d^2) * 100
xlab = paste0('PC1: ', round(frac[1], 1), '%')
ylab = paste0('PC2: ', round(frac[2], 1), '%')
plot(dcmp$v[,1], dcmp$v[,2], xlab=xlab, ylab=ylab,main="After regressing out 2 PC's")
vobj2 = voom( countObj, dsgn, plot=TRUE)
fitPC2 = lmFit(vobj2, dsgn)
quantResid2 = residuals( fitPC2, vobj2 )
dsgn = model.matrix( ~ dcmp$v[,1:2] + as.character(`ATACSeq_report:Sequencing_Order_ID`) +
`ATACSeq_report:Mean_GC_content`+
`ATACSeq_report:Mapped_Reads` +
`Age of Death` +
`PMI (in hours)` + Sex + Dx, metaData)
fitDE = lmFit(vobj2, dsgn)
fitDE = eBayes(fitDE)
topTable(fitDE, coef='DxSCZ')
## logFC AveExpr t P.Value adj.P.Val B
## Peak_45807 -0.2576961 2.567049 -4.890847 1.705466e-06 0.2570768 3.8404743
## Peak_28313 -0.3238954 1.208218 -4.869036 1.888237e-06 0.2570768 2.4110078
## Peak_141534 0.1918247 3.100491 4.340537 1.995306e-05 0.5513524 2.1676232
## Peak_94853 -0.1978153 2.643838 -4.301186 2.358040e-05 0.5765739 1.8644447
## Peak_264532 -0.3298884 1.217679 -4.526682 8.909326e-06 0.5513524 1.5276071
## Peak_145100 0.1986419 2.431929 4.121044 4.988301e-05 0.7307329 1.1790804
## Peak_24304 0.3048502 1.159261 4.391580 1.603761e-05 0.5513524 1.0192476
## Peak_241540 -0.2481119 1.628418 -4.167455 4.122603e-05 0.7015974 0.8962095
## Peak_70883 -0.3124643 1.401353 -4.242965 3.012464e-05 0.5918148 0.8896939
## Peak_174088 -0.2732556 1.678998 -4.085587 5.763699e-05 0.7307329 0.6590445
Here, clutsers are learned from controls only, so we didnt need to residualize out the case/control effect
# Compute correlation and hierarchical clustering on controls
treeList = runOrderedClusteringGenome( quantResid2[,metaData$Dx=='Control'], peakLocs2, quiet=TRUE )
# decay in original data
treeListOriginal = runOrderedClusteringGenome( vobj$E[,metaData$Dx=='Control'], peakLocs2, quiet=TRUE )
##
Evaluating: chr1
Evaluating: chr10
Evaluating: chr11
Evaluating: chr12
Evaluating: chr13
Evaluating: chr14
Evaluating: chr15
Evaluating: chr16
Evaluating: chr17
Evaluating: chr18
Evaluating: chr19
Evaluating: chr2
Evaluating: chr20
Evaluating: chr21
Evaluating: chr22
Evaluating: chr3
Evaluating: chr4
Evaluating: chr5
Evaluating: chr6
Evaluating: chr7
Evaluating: chr8
Evaluating: chr9
Evaluating: chrX
Evaluating: chrY
dfDistOriginal = evaluateCorrDecay( treeListOriginal, peakLocs2, "chr22", verbose=FALSE )
plotCorrDecay( dfDistOriginal, method="R", xlim=c(500, 1e7) )
Notice much lower correlation between distant features after residualizing
# decay in residuals
dfDist = evaluateCorrDecay( treeList, peakLocs2, "chr22", verbose=FALSE )
plotCorrDecay( dfDist, method="R", xlim=c(500, 1e7) )
treeListClusters = createClusters( treeList, method='meanClusterSize', meanClusterSize=c( 10, 20, 30, 40, 50, 100, 200) )
## Method: meanClusterSize
# get total number of clusters
n_clusters = countClusters( treeListClusters )
# score each cluster to only retain
# clusters with strong correlation structure
clstScore = scoreClusters(treeList, treeListClusters, BPPARAM=SerialParam() )
## Evaluating strength of each cluster...
##
## Dividing work into 168 chunks...
df_LEF = lapply(clstScore, function(x){
with(x, data.frame(id, N, chrom, cluster, LEF))
})
df_LEF = do.call('rbind', df_LEF )
df_LEF$id = factor(df_LEF$id, sort(unique(df_LEF$id)))
ggplot(df_LEF, aes(LEF, color=id)) + geom_density() + theme_bw(17) + theme(aspect.ratio=1) + xlim(0,1)
# retain clusters based on filterign criteria
# If lead eigen value fraction (LEF) > 40% then ke/ep clusters
# LEF is the fraction of variance explained by the first eigen-value
clustInclude = retainClusters( clstScore, "LEF", 0.1 )
# get retained clusters
treeListClusters_filter = filterClusters( treeListClusters, clustInclude )
# Collapse similar clusters
treeListClusters_collapse = collapseClusters( treeListClusters_filter, peakLocs2 )
# get total number of clusters
n_clusters = countClusters( treeListClusters_collapse )
# Evaluate Differential Correlation between two subsets of data
param = SnowParam(6, "SOCK", progressbar=TRUE)
ecdBox = evalDiffCorr( quantResid2, metaData$Dx, peakLocs2, treeListClusters_collapse, BPPARAM=param, method="Box.permute" )
## Note that clusters of 2 or fewer features are omitted from analysis
##
## # Clusters: 24572
## Dividing work into 100 chunks...
## Combining results...
# Run deltaSLE test
# ecdSLE = evalDiffCorr( quantResid2, metaData$Dx, peakLocs2, treeListClusters_collapse, BPPARAM=param, method="deltaSLE" )
# get summary of results
df = summary( ecdBox )
# print results
head(df)
## id chrom cluster pValue stat n.perm p.adjust
## 1 10 chr6 156 3.334935e-06 0.207533143 0 0.04897060
## 2 10 chr2 307 4.013654e-06 0.044223390 0 0.04897060
## 3 10 chr3 917 1.265861e-05 -0.024796473 0 0.09331026
## 4 10 chr13 745 2.097234e-05 0.166882262 0 0.09331026
## 5 10 chr9 256 2.713253e-05 0.001858912 0 0.09331026
## 6 10 chr14 818 2.714494e-05 -0.022771880 0 0.09331026
## id chrom cluster pValue stat n.perm p.adjust
## 1 10 chr6 156 3.334935e-06 0.207533143 0 0.04897060
## 2 10 chr2 307 4.013654e-06 0.044223390 0 0.04897060
## 3 10 chr3 917 1.265861e-05 -0.024796473 0 0.09331026
## 4 10 chr13 745 2.097234e-05 0.166882262 0 0.09331026
## 5 10 chr9 256 2.713253e-05 0.001858912 0 0.09331026
## 6 10 chr14 818 2.714494e-05 -0.022771880 0 0.09331026
# extract peak ID's from most significant cluster
peakIDs = getFeaturesInCluster( treeListClusters_collapse, df$chrom[1], df$cluster[1], df$id[1])
# get location of peaks in this cluster
query = range(peakLocs2[names(peakLocs2) %in% peakIDs])
# expand window to include adjacent clusters
window = 1e5
start(query) = start(query) - window
end(query) = end(query) + window
library(EnsDb.Hsapiens.v86)
ensdb = EnsDb.Hsapiens.v86
# Plot peaks in the query region
plotDecorate( ensdb, treeList, treeListClusters_collapse, peakLocs2, query, showGenes=TRUE)
# plot comparison of correlation matrices for peaks in peakIDs
# where data is subset by metadata$Disease
main = paste0(df$chrom[1], ': cluster ', df$cluster[1])
plotCompareCorr( quantResid2, peakIDs, metaData$Dx) + ggtitle(main)
topTable(fitDE, coef='DxSCZ', number=Inf)[peakIDs,]
## logFC AveExpr t P.Value adj.P.Val B
## Peak_206615 0.1726300 0.6047806 1.842895 0.06641398 0.8896406 -4.049927
## Peak_206616 -0.1414299 0.6456790 -1.681219 0.09384890 0.9052454 -4.212063
df_results = combineResults( ecdBox, clstScore, treeListClusters, peakLocs2)
## Summarizing analysis...
## Summarizing cluster properties...
## Collecting cluster locations...
## Summarizing each cluster...
## Merging results...
head(df_results)
## id chrom cluster pValue stat n.perm p.adjust N
## 1 10 chr6 156 3.334935e-06 0.207533143 0 0.04897060 2
## 2 10 chr2 307 4.013654e-06 0.044223390 0 0.04897060 2
## 3 10 chr3 917 1.265861e-05 -0.024796473 0 0.09331026 2
## 4 10 chr13 745 2.097234e-05 0.166882262 0 0.09331026 5
## 5 10 chr9 256 2.713253e-05 0.001858912 0 0.09331026 3
## 6 10 chr14 818 2.714494e-05 -0.022771880 0 0.09331026 2
## mean_abs_corr quantile75 quantile90 quantile95 LEF start
## 1 0.4523330 0.4523330 0.4523330 0.4523330 0.6130832 18894380
## 2 0.3402438 0.3402438 0.3402438 0.3402438 0.5850610 41598008
## 3 0.2749684 0.2749684 0.2749684 0.2749684 0.5687421 89233591
## 4 0.1341328 0.1690252 0.2666946 0.2962587 0.2555695 75415766
## 5 0.2284657 0.2473242 0.2774834 0.2875364 0.4101318 25323832
## 6 0.2661442 0.2661442 0.2661442 0.2661442 0.5665361 100078844
## end width
## 1 18904931 10552
## 2 41628966 30959
## 3 89238487 4897
## 4 75429006 13241
## 5 25330987 7156
## 6 100107113 28270