Raw data and processing scripts are available from Jaffe, et al. here. This workflow starts with the normalized, QCd data that I posted here

suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(decorate))
suppressPackageStartupMessages(library(GenomicRanges))
suppressPackageStartupMessages(library(limma))
suppressPackageStartupMessages(library(data.table)) 
suppressPackageStartupMessages(library(doParallel))
suppressPackageStartupMessages(library(BiocParallel))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(qvalue))

options(xtable.type="html")

knitr::opts_chunk$set(
  echo=TRUE,
  warning=FALSE,
  message=TRUE,
  error = FALSE,
  tidy = FALSE,
  cache = TRUE,
  cache.lazy = FALSE,
  dev = c("png", "pdf"), 
  fig.width=7, fig.height=7)

options(markdown.HTML.stylesheet = 'css/custom.css')

register(SnowParam(4, "SOCK", progressbar=TRUE))

# load gene locations
# this is ENSEMBL v75 from the hg19 assembly
# but other versions and assemblies are available
library(EnsDb.Hsapiens.v75)

## Loading required package: ensembldb

## Loading required package: GenomicFeatures

## Loading required package: AnnotationDbi

## Loading required package: AnnotationFilter

## 
## Attaching package: 'ensembldb'

## The following object is masked from 'package:stats':
## 
##     filter

suppressPackageStartupMessages(library(synapser))
# login once and then save info
# synLogin("user.name", "password", rememberMe=TRUE)
synLogin()

## Welcome, Gabriel Hoffman!

## NULL

Load data

response = readRDS( synGet('syn20781704')$path )
metadata = readRDS( synGet('syn20781702')$path )
featureLocation = readRDS( synGet('syn20781701')$path )

# features must have with of at least 2
end(featureLocation) = end(featureLocation) + 1

# Disease must be factor
metadata$Dx = factor(metadata$Dx, c("Control", "Schizo"))

Compute residuals

## Get residuals for decorate
design = model.matrix(~ Dx + Age + Race + negControl_PC1 + negControl_PC2 + negControl_PC3 + negControl_PC4, metadata)

fit = lmFit( response, design)
fit = eBayes( fit )

# # residuals + Dx
# residValues = residuals( fit, response) + coef(fit)[,'DxSchizo']
residValues = residuals( fit, response)

topTable(fit, coef='DxSchizo')

##                   logFC    AveExpr         t      P.Value    adj.P.Val
## cg14703058 -0.010197456 0.06554729 -7.660376 4.638158e-13 1.427984e-07
## cg23959187 -0.018667860 0.10868210 -7.555010 8.935195e-13 1.427984e-07
## cg26168557 -0.014126185 0.06012364 -7.468763 1.522234e-12 1.427984e-07
## cg16017358 -0.016918626 0.07141060 -7.457166 1.634837e-12 1.427984e-07
## cg01949938 -0.014953909 0.10922211 -7.451140 1.696555e-12 1.427984e-07
## cg14890726 -0.010445414 0.11258789 -7.434705 1.876815e-12 1.427984e-07
## cg08376848 -0.013141435 0.10207105 -7.376129 2.686891e-12 1.544102e-07
## cg03091589 -0.007599084 0.08783086 -7.374975 2.705907e-12 1.544102e-07
## cg10471794 -0.019300262 0.11428773 -7.322782 3.719888e-12 1.713379e-07
## cg01244871 -0.012484809 0.07251744 -7.321318 3.753187e-12 1.713379e-07
##                   B
## cg14703058 18.06054
## cg23959187 17.41339
## cg26168557 16.88776
## cg16017358 16.81737
## cg01949938 16.78082
## cg14890726 16.68122
## cg08376848 16.32739
## cg03091589 16.32043
## cg10471794 16.00667
## cg01244871 15.99788

Learn local correlation structure

# Compute correlation and hierarchical clustering on controls
treeList = runOrderedClusteringGenome( residValues[,metadata$Dx=='Control'], featureLocation, method.corr="spearman" )

## 
Evaluating: chr1           
Evaluating: chr2           
Evaluating: chr3           
Evaluating: chr4           
Evaluating: chr5           
Evaluating: chr6           
Evaluating: chr7           
Evaluating: chr8           
Evaluating: chr9           
Evaluating: chr10           
Evaluating: chr11           
Evaluating: chr12           
Evaluating: chr13           
Evaluating: chr14           
Evaluating: chr15           
Evaluating: chr16           
Evaluating: chr17           
Evaluating: chr18           
Evaluating: chr19           
Evaluating: chr20           
Evaluating: chr21           
Evaluating: chr22

Evaluate correlation structure versus distance

# decay in original data
treeListOriginal = runOrderedClusteringGenome( response[,metadata$Dx=='Control'], featureLocation, method.corr="spearman" )

## 
Evaluating: chr1           
Evaluating: chr2           
Evaluating: chr3           
Evaluating: chr4           
Evaluating: chr5           
Evaluating: chr6           
Evaluating: chr7           
Evaluating: chr8           
Evaluating: chr9           
Evaluating: chr10           
Evaluating: chr11           
Evaluating: chr12           
Evaluating: chr13           
Evaluating: chr14           
Evaluating: chr15           
Evaluating: chr16           
Evaluating: chr17           
Evaluating: chr18           
Evaluating: chr19           
Evaluating: chr20           
Evaluating: chr21           
Evaluating: chr22

dfDistOriginal = evaluateCorrDecay( treeListOriginal, featureLocation, "chr22" )

## 
chr22

plotCorrDecay( dfDistOriginal, method="R", xlim=c(500, 1e6), outlierQuantile=1e-5 )

## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.

## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# decay in residuals
dfDist = evaluateCorrDecay( treeList, featureLocation, "chr22" )

## 
chr22

plotCorrDecay( dfDist, method="R", xlim=c(500, 1e6), outlierQuantile=1e-5 )

## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.

## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Create clusters and then

Measure strength of correlation structure in each cluster

treeListClusters = createClusters( treeList, method='meanClusterSize', meanClusterSize=c( 10, 20, 30, 40, 50, 100, 200, 500) )

## Method: meanClusterSize

# get total number of clusters
n_clusters = countClusters( treeListClusters )

# score each cluster to only retain  
# clusters with strong correlation structure
clstScore = scoreClusters(treeList, treeListClusters )

## Evaluating strength of each cluster...
## 
## Dividing work into 176 chunks...

df_LEF = do.call('rbind', clstScore )
df_LEF$id = factor(df_LEF$id, sort(unique(df_LEF$id)))

ggplot(df_LEF, aes(LEF, color=id)) + geom_density() + theme_bw(17) + theme(aspect.ratio=1)

Filter clusters based on strength of correlation

# retain clusters based on filterign criteria
# If lead eigen value fraction (LEF) > 40% then ke/ep clusters
# LEF is the fraction of variance explained by the first eigen-value
clustInclude = retainClusters( clstScore, "LEF", 0.05 )

## Using cutoffs:
## Cluster set  cutoff
##  10      0.05
##  20      0.05
##  30      0.05
##  40      0.05
##  50      0.05
##  100     0.05
##  200     0.05
##  500     0.05

# get retained clusters  
treeListClusters_filter = filterClusters( treeListClusters, clustInclude )

# Collapse similar clusters
treeListClusters_collapse = collapseClusters( treeListClusters_filter, featureLocation )

## Identifying redundant clusters...

Test differential signal

# get total number of clusters
n_clusters = countClusters( treeListClusters_collapse )

# Evaluate Differential Correlation between two subsets of data
ecdBox = evalDiffCorr( residValues, metadata$Dx, featureLocation, treeListClusters_collapse, npermute, method = "Box.permute", method.corr="spearman")

## Note that clusters of 2 or fewer features are omitted from analysis
## 
## # Clusters: 61823 
## Dividing work into 100 chunks...
## Combining results...

# Analysis with deltaSLE
# ecdSLE = evalDiffCorr( residValues, metadata$Dx, featureLocation, treeListClusters_collapse, npermute, method = "deltaSLE", method.corr="spearman")

# get summary of results
df = summary( ecdBox )
 
# print results
head(df)

##   id chrom cluster       pValue       stat n.perm     p.adjust
## 1 10 chr13    1095 2.692875e-09  4.4348007      0 0.0001541455
## 2 10 chr17    1726 1.792182e-08  0.5843344      0 0.0003907178
## 3 10 chr13    1141 2.047716e-08  0.8910091      0 0.0003907178
## 4 10  chr8    1964 7.578456e-08  2.1475903      0 0.0010400013
## 5 10  chr1    3562 9.084251e-08 -0.0613200      0 0.0010400013
## 6 10  chr2    1280 3.344588e-07 -0.8886028      0 0.0031908480

##   id chrom cluster       pValue       stat n.perm     p.adjust
## 1 10 chr13    1095 2.980768e-09  4.4348007      0 0.0001706251
## 2 10 chr17    1726 1.836351e-08  0.5843344      0 0.0003907178
## 3 10 chr13    1141 2.047716e-08  0.8910091      0 0.0003907178
## 4 10  chr8    1964 4.981476e-08  2.1475903      0 0.0005957367
## 5 10  chr1    3562 5.203668e-08 -0.0613200      0 0.0005957367
## 6 10 chr11    1362 2.125363e-07  1.4246073      0 0.0020276667

Combine results to merge properties of each cluster into a single data.frame

df_results = combineResults( ecdBox, clstScore, treeListClusters, featureLocation)

## Summarizing analysis...
## Summarizing cluster properties...
## Collecting cluster locations...
## Summarizing each cluster...
## Merging results...

head(df_results)

##   id chrom cluster       pValue       stat n.perm     p.adjust N
## 1 10 chr13    1095 2.980768e-09  4.4348007      0 0.0001706251 9
## 2 10 chr17    1726 1.836351e-08  0.5843344      0 0.0003907178 4
## 3 10 chr13    1141 2.047716e-08  0.8910091      0 0.0003907178 2
## 4 10  chr8    1964 4.981476e-08  2.1475903      0 0.0005957367 5
## 5 10  chr1    3562 5.203668e-08 -0.0613200      0 0.0005957367 3
## 6 10 chr11    1362 2.125363e-07  1.4246073      0 0.0020276667 7
##   mean_abs_corr quantile75 quantile90 quantile95       LEF     start
## 1     0.7749821  0.8199156  0.8839297  0.9011426 0.4560792 114202067
## 2     0.5241504  0.5590907  0.5946782  0.6090513 0.4470460  60142891
## 3     0.8367778  0.8367778  0.8367778  0.8367778 0.7091945 114812177
## 4     0.8240618  0.8510913  0.8816569  0.8881425 0.5299607 145654565
## 5     0.5160377  0.5682601  0.6118153  0.6263337 0.5063517 183604674
## 6     0.6526479  0.7309050  0.7917227  0.8413434 0.4238364  63821400
##         end width
## 1 114204080  2014
## 2  60143447   557
## 3 114812185     9
## 4 145654855   291
## 5 183604790   117
## 6  63828229  6830

Of the 57242 clusters tested, 49 have a adjusted p-value < 0.05. Also, pi1=0.

Summary of cluster properties

# Histogram of LEF
ggplot(df_results, aes(LEF, fill=id)) + geom_histogram(alpha=0.7) + 
  theme_bw(17) + xlim(0,1) + 
  theme(aspect.ratio=1, legend.position="bottom", 
    plot.title = element_text(hjust = 0.5)) + 
  scale_fill_discrete(name = "Requested mean cluster size") + 
  xlab("Lead eigenvalue fraction (LEF)") + 
  ggtitle("Summarize LEF")

# Histogram of mean absolute correlation
ggplot(df_results, aes(mean_abs_corr, fill=id)) + geom_histogram(alpha=0.7) + theme_bw(17) + xlim(0,1) + 
  theme(aspect.ratio=1, legend.position="bottom", 
    plot.title = element_text(hjust = 0.5)) + 
  scale_fill_discrete(name = "Requested mean cluster size") +
   xlab("Mean absolute correlation") + 
   ggtitle("Summarize absolute correlation")

# Boxplot of number of features per cluster
df_results$id = factor(df_results$id, sort(as.numeric(unique(df_results$id))))
ggplot(df_results, aes(id, N, fill=id)) + 
  geom_boxplot() + theme_bw(17) + 
  theme(aspect.ratio=1, legend.position="bottom", 
    plot.title = element_text(hjust = 0.5)) + 
    scale_fill_discrete(name = "Feature per cluster") + 
    xlab("Requested mean cluster size") + 
    ylab("Number of features") + 
    ggtitle("Summarize feature per cluster") + 
    coord_flip()

Compare correlation structure along genome for top clusters

Pairwise scatter plots

plotScatterPairs( residValues, peakIDs, metadata$Dx) + ggtitle(main)

Compare top cluster between cases and controls

i = 1
peakIDs = getFeaturesInCluster( treeListClusters_collapse, df$chrom[i], df$cluster[i], df$id[i] )

# plot comparison of correlation matrices for peaks in peakIDs
#  where data is subset by metadata$Disease
main = paste0(df$chrom[1], ': cluster ', df$cluster[1])     
plotCompareCorr( residValues, peakIDs, metadata$Dx) + ggtitle(main)

Examine differential accessability signal for these peaks

topTable(fit, coef='DxSchizo', number=Inf)[peakIDs,]

##                  logFC   AveExpr         t    P.Value adj.P.Val         B
## cg13553936 0.004943551 0.6982189 0.6429464 0.52087691 0.8288788 -8.053151
## cg09715768 0.001418684 0.5026124 0.3480802 0.72808713 0.9178766 -8.199614
## cg16567723 0.021005325 0.3620286 2.4613535 0.01455095 0.1671329 -5.258561
## cg24121069 0.011907812 0.6590228 1.6346402 0.10344447 0.4490382 -6.927107
## cg11312353 0.005529098 0.7171607 1.2297763 0.21999334 0.6117503 -7.503942
## cg12939777 0.010737864 0.7694354 1.4298527 0.15406874 0.5301098 -7.238915
## cg00822277 0.006187138 0.7705241 0.9064247 0.36562647 0.7366940 -7.848846
## cg01383440 0.005039769 0.7552430 0.6810265 0.49651582 0.8168936 -8.027900
## cg07464248 0.010140765 0.8024060 1.9144074 0.05676546 0.3417779 -6.435436

Save results

loc = '/sc/orga/projects/psychencode/gabriel/decorate_analysis/bed/'

# all features
rtracklayer::export.bed( featureLocation, paste0(loc, "methylSz_all.bed"))

# background - only features consider in tests
featureNames_background = getFeaturesInClusterList( treeListClusters_collapse, chrom=df_results$chrom, clustID=df_results$cluster, id=df_results$id)
featureNames_background = unique(unlist(featureNames_background))
rtracklayer::export.bed( featureLocation[featureNames_background], paste0(loc, "methylSz_background.bed"))

# get significant peaks
idx = which(df_results$p.adjust < 0.05)
featureNames_signif = getFeaturesInClusterList( treeListClusters_collapse, chrom=df_results$chrom[idx], clustID=df_results$cluster[idx], id=df_results$id[idx])
featureNames_signif = unique(unlist(featureNames_signif))
rtracklayer::export.bed( featureLocation[featureNames_signif], paste0(loc, "methylSz_signif.bed"))

# get significant peaks - UP
idx = which(df_results$p.adjust < 0.05 & df_results$stat > 0)
featureNames_signif = getFeaturesInClusterList( treeListClusters_collapse, chrom=df_results$chrom[idx], clustID=df_results$cluster[idx], id=df_results$id[idx])
featureNames_signif = unique(unlist(featureNames_signif))
rtracklayer::export.bed( featureLocation[featureNames_signif], paste0(loc, "methylSz_signif_up.bed"))

# get significant peaks - DOWN
idx = which(df_results$p.adjust < 0.05 & df_results$stat < 0)
featureNames_signif = getFeaturesInClusterList( treeListClusters_collapse, chrom=df_results$chrom[idx], clustID=df_results$cluster[idx], id=df_results$id[idx])
featureNames_signif = unique(unlist(featureNames_signif))
rtracklayer::export.bed( featureLocation[featureNames_signif], paste0(loc, "methylSz_signif_down.bed"))

decorate: Local correlation analysis of DNA methylation

DNA methylation data post mortem brain from Jaffe, et al. (2016)

Developed by Gabriel Hoffman

Run on 2019-09-16