Data is available from Synapse

suppressPackageStartupMessages(library(decorate))
suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(edgeR))
suppressPackageStartupMessages(library(data.table)) 
suppressPackageStartupMessages(library(GenomicRanges)) 

options(xtable.type="html")

knitr::opts_chunk$set(
  echo=TRUE,
  warning=FALSE,
  message=TRUE,
  error = FALSE,
  tidy = FALSE,
  cache = TRUE,
  cache.lazy = FALSE,
  dev = c("png", "pdf"), 
  fig.width=7, fig.height=7)

options(markdown.HTML.stylesheet = 'css/custom.css')

suppressPackageStartupMessages(library(doParallel))
suppressPackageStartupMessages(library(BiocParallel))
suppressPackageStartupMessages(library(synapser))
# login once and then save info
# synLogin("user.name", "password", rememberMe=TRUE)
synLogin()

## Welcome, Gabriel Hoffman!

## NULL

Download data

# chip-Seq counts
chipCounts = read.table( synGet('syn17061266')$path, header=TRUE, stringsAsFactors=FALSE, sep='\t', row.names=1)

# # peak locations
peakLocs_in = fread( synGet('syn17061264')$path )
peakLocs = with(peakLocs_in, GRanges(V1, IRanges(V2, V3, names=V4)))

meta_clinical = fread( synGet('syn17943350')$path )

meta_atac = fread( synGet('syn20780661')$path )

metaData = merge( meta_atac, meta_clinical, by.x='Individual_ID', by.y='Individual ID')

idx = match( colnames(chipCounts), metaData$Assay_Sample_ID)
metaData = metaData[idx,]
# match( colnames(chipCounts), metaData$Assay_Sample_ID)

# filter by Dx
keep = metaData$Dx %in% c('SCZ', 'Control')
metaData = metaData[keep,]
chipCounts = chipCounts[,keep]
metaData$Dx = factor(metaData$Dx, c("Control", "SCZ"))

Processed data

isexpr = rowSums(cpm(chipCounts)>1) >= 0.2*ncol(chipCounts)
peakLocs2 = peakLocs[which(isexpr)]
 
# Standard usage of limma/voom
countObj = DGEList( chipCounts[isexpr,] )
countObj = calcNormFactors( countObj )
design = model.matrix( ~ as.character(`ATACSeq_report:Sequencing_Order_ID`) + 
`ATACSeq_report:Mean_GC_content`+
`ATACSeq_report:Mapped_Reads` +
`Age of Death` +
`PMI (in hours)` + Sex , metaData)
vobj = voom( countObj, design, plot=TRUE)

# identical( names(peakLocs2), rownames(vobj))

PCA on normalized ATAC-seeq

dcmp = svd(vobj$E, nv=5, nu=0)
frac = dcmp$d^2 / sum(dcmp$d^2) * 100
xlab = paste0('PC1: ', round(frac[1], 1), '%')
ylab = paste0('PC2: ', round(frac[2], 1), '%')
plot(dcmp$v[,1], dcmp$v[,2], xlab=xlab, ylab=ylab, main="Original data")

dsgn = model.matrix( ~ dcmp$v[,1:2] + as.character(`ATACSeq_report:Sequencing_Order_ID`) + 
`ATACSeq_report:Mean_GC_content`+
`ATACSeq_report:Mapped_Reads` +
`Age of Death` +
`PMI (in hours)` + Sex , metaData)
fitPC = lmFit(vobj, dsgn)
quantResid = residuals( fitPC, vobj )

dcmp = svd(quantResid, nv=5, nu=0) 
frac = dcmp$d^2 / sum(dcmp$d^2) * 100
xlab = paste0('PC1: ', round(frac[1], 1), '%')
ylab = paste0('PC2: ', round(frac[2], 1), '%')
plot(dcmp$v[,1], dcmp$v[,2], xlab=xlab, ylab=ylab,main="After regressing out 2 PC's")

Compute residuals to remove effect of confounders

vobj2 = voom( countObj, dsgn, plot=TRUE)

fitPC2 = lmFit(vobj2, dsgn)
quantResid2 = residuals( fitPC2, vobj2 )

Identify differentially accessable peaks

dsgn = model.matrix( ~ dcmp$v[,1:2] + as.character(`ATACSeq_report:Sequencing_Order_ID`) + 
`ATACSeq_report:Mean_GC_content`+
`ATACSeq_report:Mapped_Reads` +
`Age of Death` +
`PMI (in hours)` + Sex + Dx, metaData)

fitDE = lmFit(vobj2, dsgn)

fitDE = eBayes(fitDE)

topTable(fitDE, coef='DxSCZ')

##                  logFC  AveExpr         t      P.Value adj.P.Val         B
## Peak_45807  -0.2576961 2.567049 -4.890847 1.705466e-06 0.2570768 3.8404743
## Peak_28313  -0.3238954 1.208218 -4.869036 1.888237e-06 0.2570768 2.4110078
## Peak_141534  0.1918247 3.100491  4.340537 1.995306e-05 0.5513524 2.1676232
## Peak_94853  -0.1978153 2.643838 -4.301186 2.358040e-05 0.5765739 1.8644447
## Peak_264532 -0.3298884 1.217679 -4.526682 8.909326e-06 0.5513524 1.5276071
## Peak_145100  0.1986419 2.431929  4.121044 4.988301e-05 0.7307329 1.1790804
## Peak_24304   0.3048502 1.159261  4.391580 1.603761e-05 0.5513524 1.0192476
## Peak_241540 -0.2481119 1.628418 -4.167455 4.122603e-05 0.7015974 0.8962095
## Peak_70883  -0.3124643 1.401353 -4.242965 3.012464e-05 0.5918148 0.8896939
## Peak_174088 -0.2732556 1.678998 -4.085587 5.763699e-05 0.7307329 0.6590445

Learn local correlation structure

Here, clutsers are learned from controls only, so we didnt need to residualize out the case/control effect

# Compute correlation and hierarchical clustering on controls
treeList = runOrderedClusteringGenome( quantResid2[,metaData$Dx=='Control'], peakLocs2, quiet=TRUE )

Evaluate correlation structure versus distance

Original data

# decay in original data
treeListOriginal = runOrderedClusteringGenome( vobj$E[,metaData$Dx=='Control'], peakLocs2, quiet=TRUE )

## 
Evaluating: chr1           
Evaluating: chr10           
Evaluating: chr11           
Evaluating: chr12           
Evaluating: chr13           
Evaluating: chr14           
Evaluating: chr15           
Evaluating: chr16           
Evaluating: chr17           
Evaluating: chr18           
Evaluating: chr19           
Evaluating: chr2           
Evaluating: chr20           
Evaluating: chr21           
Evaluating: chr22           
Evaluating: chr3           
Evaluating: chr4           
Evaluating: chr5           
Evaluating: chr6           
Evaluating: chr7           
Evaluating: chr8           
Evaluating: chr9           
Evaluating: chrX           
Evaluating: chrY

dfDistOriginal = evaluateCorrDecay( treeListOriginal, peakLocs2, "chr22", verbose=FALSE )
plotCorrDecay( dfDistOriginal, method="R", xlim=c(500, 1e7) )

Residuals

Notice much lower correlation between distant features after residualizing

# decay in residuals
dfDist = evaluateCorrDecay( treeList, peakLocs2, "chr22", verbose=FALSE )
plotCorrDecay( dfDist, method="R", xlim=c(500, 1e7) )

Create clusters and then measure strength of correlation structure in each cluster

treeListClusters = createClusters( treeList, method='meanClusterSize', meanClusterSize=c( 10, 20, 30, 40, 50, 100, 200) )

## Method: meanClusterSize

# get total number of clusters
n_clusters = countClusters( treeListClusters )

# score each cluster to only retain  
# clusters with strong correlation structure
clstScore = scoreClusters(treeList, treeListClusters, BPPARAM=SerialParam() )

## Evaluating strength of each cluster...
## 
## Dividing work into 168 chunks...

Histogram of LEF values

df_LEF = lapply(clstScore, function(x){
 with(x, data.frame(id, N, chrom, cluster, LEF))
  })
df_LEF = do.call('rbind', df_LEF )
df_LEF$id = factor(df_LEF$id, sort(unique(df_LEF$id)))

ggplot(df_LEF, aes(LEF, color=id)) + geom_density() + theme_bw(17) + theme(aspect.ratio=1) + xlim(0,1)

Filter clusters based on strength of correlation

# retain clusters based on filterign criteria
# If lead eigen value fraction (LEF) > 40% then ke/ep clusters
# LEF is the fraction of variance explained by the first eigen-value
clustInclude = retainClusters( clstScore, "LEF", 0.1 )

# get retained clusters  
treeListClusters_filter = filterClusters( treeListClusters, clustInclude )

# Collapse similar clusters
treeListClusters_collapse = collapseClusters( treeListClusters_filter, peakLocs2 )

Test differential signal

# get total number of clusters
n_clusters = countClusters( treeListClusters_collapse )

# Evaluate Differential Correlation between two subsets of data
param = SnowParam(6, "SOCK", progressbar=TRUE)

ecdBox = evalDiffCorr( quantResid2, metaData$Dx, peakLocs2, treeListClusters_collapse, BPPARAM=param, method="Box.permute" )

## Note that clusters of 2 or fewer features are omitted from analysis
## 
## # Clusters: 24572 
## Dividing work into 100 chunks...
## Combining results...

# Run deltaSLE test 
# ecdSLE = evalDiffCorr( quantResid2, metaData$Dx, peakLocs2, treeListClusters_collapse, BPPARAM=param, method="deltaSLE" )

# get summary of results
df = summary( ecdBox )
 
# print results
head(df)

##   id chrom cluster       pValue         stat n.perm   p.adjust
## 1 10  chr6     156 3.334935e-06  0.207533143      0 0.04897060
## 2 10  chr2     307 4.013654e-06  0.044223390      0 0.04897060
## 3 10  chr3     917 1.265861e-05 -0.024796473      0 0.09331026
## 4 10 chr13     745 2.097234e-05  0.166882262      0 0.09331026
## 5 10  chr9     256 2.713253e-05  0.001858912      0 0.09331026
## 6 10 chr14     818 2.714494e-05 -0.022771880      0 0.09331026

##   id chrom cluster       pValue         stat n.perm   p.adjust
## 1 10  chr6     156 3.334935e-06  0.207533143      0 0.04897060
## 2 10  chr2     307 4.013654e-06  0.044223390      0 0.04897060
## 3 10  chr3     917 1.265861e-05 -0.024796473      0 0.09331026
## 4 10 chr13     745 2.097234e-05  0.166882262      0 0.09331026
## 5 10  chr9     256 2.713253e-05  0.001858912      0 0.09331026
## 6 10 chr14     818 2.714494e-05 -0.022771880      0 0.09331026

Examine top cluster

# extract peak ID's from most significant cluster
peakIDs = getFeaturesInCluster( treeListClusters_collapse, df$chrom[1], df$cluster[1], df$id[1])
    
# get location of peaks in this cluster  
query = range(peakLocs2[names(peakLocs2) %in% peakIDs])

# expand window to include adjacent clusters
window = 1e5 
start(query) = start(query) - window
end(query) = end(query) + window

library(EnsDb.Hsapiens.v86)      
ensdb = EnsDb.Hsapiens.v86
   
# Plot peaks in the query region
plotDecorate( ensdb, treeList, treeListClusters_collapse, peakLocs2, query, showGenes=TRUE)

Compare top cluster between cases and controls

# plot comparison of correlation matrices for peaks in peakIDs
#  where data is subset by metadata$Disease
main = paste0(df$chrom[1], ': cluster ', df$cluster[1])     
plotCompareCorr( quantResid2, peakIDs, metaData$Dx) + ggtitle(main)

Examine differential accessability signal for these peaks

topTable(fitDE, coef='DxSCZ', number=Inf)[peakIDs,]

##                  logFC   AveExpr         t    P.Value adj.P.Val         B
## Peak_206615  0.1726300 0.6047806  1.842895 0.06641398 0.8896406 -4.049927
## Peak_206616 -0.1414299 0.6456790 -1.681219 0.09384890 0.9052454 -4.212063

df_results = combineResults( ecdBox, clstScore, treeListClusters, peakLocs2)

## Summarizing analysis...
## Summarizing cluster properties...
## Collecting cluster locations...

## Summarizing each cluster...

## Merging results...

head(df_results)

##   id chrom cluster       pValue         stat n.perm   p.adjust N
## 1 10  chr6     156 3.334935e-06  0.207533143      0 0.04897060 2
## 2 10  chr2     307 4.013654e-06  0.044223390      0 0.04897060 2
## 3 10  chr3     917 1.265861e-05 -0.024796473      0 0.09331026 2
## 4 10 chr13     745 2.097234e-05  0.166882262      0 0.09331026 5
## 5 10  chr9     256 2.713253e-05  0.001858912      0 0.09331026 3
## 6 10 chr14     818 2.714494e-05 -0.022771880      0 0.09331026 2
##   mean_abs_corr quantile75 quantile90 quantile95       LEF     start
## 1     0.4523330  0.4523330  0.4523330  0.4523330 0.6130832  18894380
## 2     0.3402438  0.3402438  0.3402438  0.3402438 0.5850610  41598008
## 3     0.2749684  0.2749684  0.2749684  0.2749684 0.5687421  89233591
## 4     0.1341328  0.1690252  0.2666946  0.2962587 0.2555695  75415766
## 5     0.2284657  0.2473242  0.2774834  0.2875364 0.4101318  25323832
## 6     0.2661442  0.2661442  0.2661442  0.2661442 0.5665361 100078844
##         end width
## 1  18904931 10552
## 2  41628966 30959
## 3  89238487  4897
## 4  75429006 13241
## 5  25330987  7156
## 6 100107113 28270

decorate: Local correlation analysis of ATAC-seq

ATAC-seq from post mortem brains from Bryois, et al. (2018)

Developed by Gabriel Hoffman

Run on 2019-09-16