Raw data and processing scripts are available from Jaffe et al. here. This workflow starts with the normalized, QC'd data that I posted here.



# load gene locations
# this is ENSEMBL v75 from the hg19 assembly
# but other versions and assemblies are available
# login once and then save info
# synLogin("user.name", "password", rememberMe=TRUE)
## Welcome, Gabriel Hoffman!

Load data

# Fetch the response data, sample metadata and feature coordinates
# via synGet(); each downloaded file is read as an RDS object
response <- readRDS(synGet("syn20781704")$path)
metadata <- readRDS(synGet("syn20781702")$path)
featureLocation <- readRDS(synGet("syn20781701")$path)

# Features must have a width of at least 2, so push every end out by one
end(featureLocation) <- end(featureLocation) + 1

# Disease status must be a factor; "Control" is listed first so that it
# becomes the first (reference) level
metadata$Dx <- factor(metadata$Dx, levels = c("Control", "Schizo"))

Compute residuals

## Get residuals for decorate
# Design matrix: disease status, age, race and four negative-control PCs
design <- model.matrix(
  ~ Dx + Age + Race +
    negControl_PC1 + negControl_PC2 + negControl_PC3 + negControl_PC4,
  data = metadata
)

# Fit the linear model per feature, then apply empirical Bayes moderation
fit <- eBayes(lmFit(response, design))

# Alternative kept for reference: residuals with the disease effect added back
# residValues = residuals( fit, response) + coef(fit)[,'DxSchizo']
residValues <- residuals(fit, response)

# Top-ranked features for the disease coefficient
topTable(fit, coef = "DxSchizo")
##                   logFC    AveExpr         t      P.Value    adj.P.Val
## cg14703058 -0.010197456 0.06554729 -7.660376 4.638158e-13 1.427984e-07
## cg23959187 -0.018667860 0.10868210 -7.555010 8.935195e-13 1.427984e-07
## cg26168557 -0.014126185 0.06012364 -7.468763 1.522234e-12 1.427984e-07
## cg16017358 -0.016918626 0.07141060 -7.457166 1.634837e-12 1.427984e-07
## cg01949938 -0.014953909 0.10922211 -7.451140 1.696555e-12 1.427984e-07
## cg14890726 -0.010445414 0.11258789 -7.434705 1.876815e-12 1.427984e-07
## cg08376848 -0.013141435 0.10207105 -7.376129 2.686891e-12 1.544102e-07
## cg03091589 -0.007599084 0.08783086 -7.374975 2.705907e-12 1.544102e-07
## cg10471794 -0.019300262 0.11428773 -7.322782 3.719888e-12 1.713379e-07
## cg01244871 -0.012484809 0.07251744 -7.321318 3.753187e-12 1.713379e-07
##                   B
## cg14703058 18.06054
## cg23959187 17.41339
## cg26168557 16.88776
## cg16017358 16.81737
## cg01949938 16.78082
## cg14890726 16.68122
## cg08376848 16.32739
## cg03091589 16.32043
## cg10471794 16.00667
## cg01244871 15.99788

Learn local correlation structure

# Learn the correlation structure from control samples only:
# Spearman correlation on the residuals, then hierarchical clustering
treeList <- runOrderedClusteringGenome(
  residValues[, metadata$Dx == "Control"],
  featureLocation,
  method.corr = "spearman"
)
Evaluate correlation structure versus distance

# Correlation decay on chr22 in the original (un-residualized) data,
# again restricted to control samples
treeListOriginal <- runOrderedClusteringGenome(
  response[, metadata$Dx == "Control"],
  featureLocation,
  method.corr = "spearman"
)
dfDistOriginal <- evaluateCorrDecay(treeListOriginal, featureLocation, "chr22")
plotCorrDecay(
  dfDistOriginal,
  method = "R",
  xlim = c(500, 1e6),
  outlierQuantile = 1e-5
)

# Same decay curve, this time computed from the residual-based trees
dfDist <- evaluateCorrDecay(treeList, featureLocation, "chr22")
plotCorrDecay(
  dfDist,
  method = "R",
  xlim = c(500, 1e6),
  outlierQuantile = 1e-5
)
Create clusters and then

Measure strength of correlation structure in each cluster

# Build clusters from the trees for a range of requested mean cluster sizes
treeListClusters <- createClusters(
  treeList,
  method = "meanClusterSize",
  meanClusterSize = c(10, 20, 30, 40, 50, 100, 200, 500)
)

# Total number of clusters across all requested sizes
n_clusters <- countClusters(treeListClusters)

# Score each cluster so that only clusters with strong
# correlation structure are retained in the filtering step
clstScore <- scoreClusters(treeList, treeListClusters)

# Row-bind the elements of clstScore and order the id levels by sorted value
df_LEF <- do.call(rbind, clstScore)
df_LEF$id <- factor(df_LEF$id, levels = sort(unique(df_LEF$id)))

# Density of LEF, one curve per id
ggplot(df_LEF, aes(LEF, color = id)) +
  geom_density() +
  theme_bw(17) +
  theme(aspect.ratio = 1)

Filter clusters based on strength of correlation

# Retain clusters based on a filtering criterion:
# keep clusters whose lead eigenvalue fraction (LEF) passes the 0.05 cutoff.
# LEF is the fraction of variance explained by the first eigenvalue.
# NOTE(review): this comment previously described a 40% cutoff, but the
# value actually passed is 0.05 -- confirm which threshold is intended.
clustInclude <- retainClusters(clstScore, "LEF", 0.05)

# Subset the cluster list to the retained clusters
treeListClusters_filter <- filterClusters(treeListClusters, clustInclude)

# Collapse similar clusters
treeListClusters_collapse <- collapseClusters(
  treeListClusters_filter,
  featureLocation
)
Test differential signal

# Total number of clusters remaining after filtering and collapsing
n_clusters <- countClusters(treeListClusters_collapse)

# Evaluate differential correlation between the two disease groups.
# NOTE(review): npermute is not assigned in this section; it is assumed
# to be defined earlier in the workflow -- confirm.
ecdBox <- evalDiffCorr(
  residValues,
  metadata$Dx,
  featureLocation,
  treeListClusters_collapse,
  npermute,
  method = "Box.permute",
  method.corr = "spearman"
)

# Alternative analysis with deltaSLE, kept for reference
# ecdSLE = evalDiffCorr( residValues, metadata$Dx, featureLocation, treeListClusters_collapse, npermute, method = "deltaSLE", method.corr="spearman")

# Summarize the test results
df <- summary(ecdBox)
# print results
##   id chrom cluster       pValue       stat n.perm     p.adjust
## 1 10 chr13    1095 2.692875e-09  4.4348007      0 0.0001541455
## 2 10 chr17    1726 1.792182e-08  0.5843344      0 0.0003907178
## 3 10 chr13    1141 2.047716e-08  0.8910091      0 0.0003907178
## 4 10  chr8    1964 7.578456e-08  2.1475903      0 0.0010400013
## 5 10  chr1    3562 9.084251e-08 -0.0613200      0 0.0010400013
## 6 10  chr2    1280 3.344588e-07 -0.8886028      0 0.0031908480
##   id chrom cluster       pValue       stat n.perm     p.adjust
## 1 10 chr13    1095 2.980768e-09  4.4348007      0 0.0001706251
## 2 10 chr17    1726 1.836351e-08  0.5843344      0 0.0003907178
## 3 10 chr13    1141 2.047716e-08  0.8910091      0 0.0003907178
## 4 10  chr8    1964 4.981476e-08  2.1475903      0 0.0005957367
## 5 10  chr1    3562 5.203668e-08 -0.0613200      0 0.0005957367
## 6 10 chr11    1362 2.125363e-07  1.4246073      0 0.0020276667

Combine results to merge properties of each cluster into a single data.frame

# Combine the test results, cluster scores, cluster assignments and
# feature locations into one table
df_results <- combineResults(
  ecdBox,
  clstScore,
  treeListClusters,
  featureLocation
)
##   id chrom cluster       pValue       stat n.perm     p.adjust N
## 1 10 chr13    1095 2.980768e-09  4.4348007      0 0.0001706251 9
## 2 10 chr17    1726 1.836351e-08  0.5843344      0 0.0003907178 4
## 3 10 chr13    1141 2.047716e-08  0.8910091      0 0.0003907178 2
## 4 10  chr8    1964 4.981476e-08  2.1475903      0 0.0005957367 5
## 5 10  chr1    3562 5.203668e-08 -0.0613200      0 0.0005957367 3
## 6 10 chr11    1362 2.125363e-07  1.4246073      0 0.0020276667 7
##   mean_abs_corr quantile75 quantile90 quantile95       LEF     start
## 1     0.7749821  0.8199156  0.8839297  0.9011426 0.4560792 114202067
## 2     0.5241504  0.5590907  0.5946782  0.6090513 0.4470460  60142891
## 3     0.8367778  0.8367778  0.8367778  0.8367778 0.7091945 114812177
## 4     0.8240618  0.8510913  0.8816569  0.8881425 0.5299607 145654565
## 5     0.5160377  0.5682601  0.6118153  0.6263337 0.5063517 183604674
## 6     0.6526479  0.7309050  0.7917227  0.8413434 0.4238364  63821400
##         end width
## 1 114204080  2014
## 2  60143447   557
## 3 114812185     9
## 4 145654855   291
## 5 183604790   117
## 6  63828229  6830

Of the 57242 clusters tested, 49 have an adjusted p-value < 0.05. Also, pi1=0.

Summary of cluster properties

# Histogram of LEF, filled by requested mean cluster size
ggplot(df_results, aes(LEF, fill = id)) +
  geom_histogram(alpha = 0.7) +
  theme_bw(17) +
  xlim(0, 1) +
  theme(
    aspect.ratio = 1,
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5)  # centre the title
  ) +
  scale_fill_discrete(name = "Requested mean cluster size") +
  xlab("Lead eigenvalue fraction (LEF)") +
  ggtitle("Summarize LEF")

# Histogram of mean absolute correlation, filled by requested mean cluster size
ggplot(df_results, aes(mean_abs_corr, fill = id)) +
  geom_histogram(alpha = 0.7) +
  theme_bw(17) +
  xlim(0, 1) +
  theme(
    aspect.ratio = 1,
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5)  # centre the title
  ) +
  scale_fill_discrete(name = "Requested mean cluster size") +
  xlab("Mean absolute correlation") +
  ggtitle("Summarize absolute correlation")

# Boxplot of number of features per cluster
# Order the id levels numerically. Going through as.character() first makes
# this safe if id is already a factor: as.numeric() applied directly to a
# factor returns its integer codes rather than the labelled values.
df_results$id <- factor(
  df_results$id,
  levels = sort(as.numeric(as.character(unique(df_results$id))))
)
# The plot expression must end without a trailing `+`; a dangling `+`
# leaves the expression incomplete.
ggplot(df_results, aes(id, N, fill = id)) +
  geom_boxplot() +
  theme_bw(17) +
  theme(
    aspect.ratio = 1,
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5)  # centre the title
  ) +
  scale_fill_discrete(name = "Feature per cluster") +
  xlab("Requested mean cluster size") +
  ylab("Number of features") +
  ggtitle("Summarize feature per cluster")

Compare correlation structure along genome for top clusters

Pairwise scatter plots

# Select the top-ranked cluster and look up its features. peakIDs and main
# must be defined before plotting; in this section they are otherwise only
# assigned further down, after this call.
i <- 1
peakIDs <- getFeaturesInCluster(
  treeListClusters_collapse,
  df$chrom[i],
  df$cluster[i],
  df$id[i]
)
main <- paste0(df$chrom[i], ': cluster ', df$cluster[i])

# Pairwise scatter plots of the cluster's features, split by metadata$Dx
plotScatterPairs(residValues, peakIDs, metadata$Dx) + ggtitle(main)

Compare top cluster between cases and controls

# Index (row of df) of the cluster to inspect; 1 is the first row
i <- 1
peakIDs <- getFeaturesInCluster(
  treeListClusters_collapse,
  df$chrom[i],
  df$cluster[i],
  df$id[i]
)

# Plot comparison of correlation matrices for the features in peakIDs,
# where the data is subset by metadata$Dx.
# The title is built from the same index i used to select the cluster, so
# it stays in sync if i is changed.
main <- paste0(df$chrom[i], ': cluster ', df$cluster[i])
plotCompareCorr(residValues, peakIDs, metadata$Dx) + ggtitle(main)

Examine differential accessibility signal for these peaks

# Full table for the disease coefficient (number = Inf returns every
# feature), restricted to the rows named in peakIDs
topTable(fit, coef = "DxSchizo", number = Inf)[peakIDs, ]
##                  logFC   AveExpr         t    P.Value adj.P.Val         B
## cg13553936 0.004943551 0.6982189 0.6429464 0.52087691 0.8288788 -8.053151
## cg09715768 0.001418684 0.5026124 0.3480802 0.72808713 0.9178766 -8.199614
## cg16567723 0.021005325 0.3620286 2.4613535 0.01455095 0.1671329 -5.258561
## cg24121069 0.011907812 0.6590228 1.6346402 0.10344447 0.4490382 -6.927107
## cg11312353 0.005529098 0.7171607 1.2297763 0.21999334 0.6117503 -7.503942
## cg12939777 0.010737864 0.7694354 1.4298527 0.15406874 0.5301098 -7.238915
## cg00822277 0.006187138 0.7705241 0.9064247 0.36562647 0.7366940 -7.848846
## cg01383440 0.005039769 0.7552430 0.6810265 0.49651582 0.8168936 -8.027900
## cg07464248 0.010140765 0.8024060 1.9144074 0.05676546 0.3417779 -6.435436

Save results

# Output directory for the BED files (the trailing slash is relied on by
# the paste0() calls below)
loc <- '/sc/orga/projects/psychencode/gabriel/decorate_analysis/bed/'

# All features
rtracklayer::export.bed(featureLocation, paste0(loc, "methylSz_all.bed"))

# Background: only features that belong to clusters listed in df_results
featureNames_background <- getFeaturesInClusterList(
  treeListClusters_collapse,
  chrom = df_results$chrom,
  clustID = df_results$cluster,
  id = df_results$id
)
featureNames_background <- unique(unlist(featureNames_background))
rtracklayer::export.bed(
  featureLocation[featureNames_background],
  paste0(loc, "methylSz_background.bed")
)

# Significant clusters: adjusted p-value below 0.05
idx <- which(df_results$p.adjust < 0.05)
featureNames_signif <- getFeaturesInClusterList(
  treeListClusters_collapse,
  chrom = df_results$chrom[idx],
  clustID = df_results$cluster[idx],
  id = df_results$id[idx]
)
featureNames_signif <- unique(unlist(featureNames_signif))
rtracklayer::export.bed(
  featureLocation[featureNames_signif],
  paste0(loc, "methylSz_signif.bed")
)

# Significant clusters with a positive test statistic (UP)
idx <- which(df_results$p.adjust < 0.05 & df_results$stat > 0)
featureNames_signif <- getFeaturesInClusterList(
  treeListClusters_collapse,
  chrom = df_results$chrom[idx],
  clustID = df_results$cluster[idx],
  id = df_results$id[idx]
)
featureNames_signif <- unique(unlist(featureNames_signif))
rtracklayer::export.bed(
  featureLocation[featureNames_signif],
  paste0(loc, "methylSz_signif_up.bed")
)

# Significant clusters with a negative test statistic (DOWN)
idx <- which(df_results$p.adjust < 0.05 & df_results$stat < 0)
featureNames_signif <- getFeaturesInClusterList(
  treeListClusters_collapse,
  chrom = df_results$chrom[idx],
  clustID = df_results$cluster[idx],
  id = df_results$id[idx]
)
featureNames_signif <- unique(unlist(featureNames_signif))
rtracklayer::export.bed(
  featureLocation[featureNames_signif],
  paste0(loc, "methylSz_signif_down.bed")
)