The data files are to big (18Gb) to repost, but they can be find oun the TGCA Genomic Data Commons here

# load gene locations
# this is ENSEMBL v86 from the hg38 assembly
# but other versions and assemblies are available
library(EnsDb.Hsapiens.v86)
# Illumina Methylation 450K data
files = list.files(folder, pattern="*.gdc_hg38.txt.gz", recursive=TRUE, full.names=TRUE)

methl.list = lapply( files, function(file){

  # read file
  df = fread( file )
  colnames(df)[colnames(df) == 'Composite Element REF'] = 'name'

  # reduce memory usage by dropping colums
  if( file != files[1] ){
    df = df[,1:2]
  }

  # get sample id 
  id = gsub(".gdc_hg38.txt.gz", "", basename(file))
  id = unlist(strsplit(id, '\\.'))

  # get folder
  fldr = unlist(strsplit(dirname(file), '/'))

  # Set sample id and folder name
  attr(df, "folder") = fldr[length(fldr)]
  attr(df, "sampleID") = id[length(id)]

  df
})

# merge file sinto single matrix
methylData = Reduce(function(a, b){
    merge(a, b, by = "name", all.x = TRUE, all.y = TRUE)
  }, 
  lapply(methl.list, function(x){
    a = x[,c('name', 'Beta_value')]
    colnames(a)[2] = attr(a, "sampleID")
    a
    } )
  )
probeNames = methylData$name
methylData[,name:=NULL]
methylData = as.data.frame(methylData)
rownames(methylData) = probeNames

# get probe info as GRanges
probeInfo = with(methl.list[[1]], GRanges(Chromosome, IRanges(Start, End, names = name), Feature_Type=Feature_Type, Gene_Symbol=Gene_Symbol, Position_to_TSS=Position_to_TSS))

# Set methylData order to same as probeInfo
idx = match(names(probeInfo), rownames(methylData))
methylData = methylData[idx,]

# idx = match(probeInfo$name, rownames(methylData))
# is.unsorted(idx)
# any(is.na(idx))

# reorder by chromosome position
ord = order(probeInfo)
probeInfo = probeInfo[ord]
methylData = methylData[ord,]

# mean imputation of probes with less than 10 missing samples
countNA = apply(methylData, 1, function(x) sum(is.na(x)))
mu = apply(methylData, 1, function(x) mean(x, na.rm=TRUE))

for(idx in which(countNA > 0 & countNA < 10)){
  # cat("\r", idx, '    ')
  # methylData[idx,][is.na(methylData[idx,])] = mu[idx]

  value = methylData[idx,]
  methylData[idx,is.na(value)] = mu[idx]
}

# filter out probes with NA files
methylData = methylData[which(countNA < 10),]
probeInfo = probeInfo[which(countNA < 10)]

# probe location must be valid
idx = which(start(probeInfo) > 1)
methylData = methylData[idx,]
probeInfo = probeInfo[idx]

# drop seqnames with no entries
chrNameCount = table(seqnames(probeInfo))
probeInfo = dropSeqlevels( probeInfo, names(chrNameCount[chrNameCount==0]))

# get shorter sample names
colnames(methylData) = sapply(strsplit(colnames(methylData), '-'), function(x) paste(x[1:4], collapse='-'))

# sample metadata
info = fread(paste0(folder, "/metadata/sample.tsv"), sep="\t")
# table(info$sample_type)
info = info[sample_type %in% c('Primary Tumor', 'Solid Tissue Normal') & !is.na(sample_type),]
info$sample_type = factor(info$sample_type , c('Solid Tissue Normal', 'Primary Tumor'))
# any(is.na(info$sample_type))

# reorder info based on methylData
ids = intersect(colnames(methylData), info$sample_submitter_id)

info = droplevels(info[match(ids, sample_submitter_id),])
methylData = methylData[,match(ids, colnames(methylData)),]

# identical( info$sample_submitter_id, colnames(methylData))

Compute residuals

# Compute PCA on original dataset
dcmp = svd( t(scale(t(methylData))), nu=2, nv=40)
colnames(dcmp$v) = paste0("PC", 1:ncol(dcmp$v))

# Compute residuals after accounting for PC's
# and add back disease effect
design = model.matrix(~  ., data.frame(Disease = info$sample_type, dcmp$v))
fitFull = lmFit( methylData, design)
fitFull = eBayes(fitFull)
methylDataResiduals = residuals( fitFull, methylData) + coef(fitFull)[,'DiseasePrimary Tumor', drop=FALSE] %*% t(design[,'DiseasePrimary Tumor',drop=FALSE] )
design = model.matrix(~ ., data.frame(Disease = info$sample_type))
fitTest = lmFit( methylData, design)
fitTest = eBayes(fitTest)
df = data.frame(info, dcmp$v[,1:20])

ggplot(df, aes(PC1, PC2, color=sample_type)) + geom_point() + theme_bw(17) + theme(aspect.ratio=1, plot.title = element_text(hjust = 0.5)) + ggtitle("Original data")

dcmpResid = svd( t(scale(t(methylDataResiduals))), nu=2, nv=2)
colnames(dcmpResid$v) = paste0("PC", 1:ncol(dcmpResid$v))

# df = data.frame(info, dcmpResid$v[,1:2])
# ggplot(df, aes(PC1, PC2, color=sample_type)) + geom_point() + theme_bw(17) + theme(aspect.ratio=1, plot.title = element_text(hjust = 0.5)) + ggtitle("Residuals")
# rm(dcmpResid)

Learn local correlation structure

# Compute correlation and hierarchical clustering on controls
treeList = runOrderedClusteringGenome( methylDataResiduals[,which(info$sample_type=='Primary Tumor')], probeInfo, method.corr="spearman", quiet=TRUE )      
## 
Evaluating: chr16           
Evaluating: chr3           
Evaluating: chr1           
Evaluating: chr8           
Evaluating: chr14           
Evaluating: chr15           
Evaluating: chr9           
Evaluating: chr19           
Evaluating: chr6           
Evaluating: chr12           
Evaluating: chr2           
Evaluating: chr4           
Evaluating: chr11           
Evaluating: chr20           
Evaluating: chr10           
Evaluating: chr18           
Evaluating: chr21           
Evaluating: chr17           
Evaluating: chr7           
Evaluating: chr22           
Evaluating: chrX           
Evaluating: chr13           
Evaluating: chr5           
Evaluating: chrY

Evaluate correlation structure versus distance

dfDist = evaluateCorrDecay( treeList, probeInfo, "chr22", verbose=FALSE )
  
plotCorrDecay( dfDist, outlierQuantile=1e-5 )

Create clusters and then measure strength of correlation structure in each cluster

treeListClusters = createClusters( treeList, method='meanClusterSize', meanClusterSize=c( 5, 10, 20, 30, 40, 50, 100, 200, 500) )
## Method: meanClusterSize
# get total number of clusters
n_clusters = countClusters( treeListClusters )

# score each cluster to only retain  
# clusters with strong correlation structure
clstScore = scoreClusters(treeList, treeListClusters )
## Evaluating strength of each cluster...
## 
## Dividing work into 216 chunks...
df_LEF = do.call('rbind', clstScore )
df_LEF$id = factor(df_LEF$id, sort(unique(df_LEF$id)))

ggplot(df_LEF, aes(LEF, color=id)) + geom_density() + theme_bw(17) + theme(aspect.ratio=1)

 ggplot(df_LEF, aes(N, LEF, color=id)) + geom_point() + geom_smooth(se=FALSE) + theme_bw(17) + theme(aspect.ratio=1) + xlim(0, 100)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Filter clusters based on strength of correlation

# retain clusters based on filtering criteria
# LEF is the fraction of variance explained by the first eigen-value
clustInclude = retainClusters( clstScore, "LEF", .05 )
    
# get retained clusters  
treeListClusters_filter = filterClusters( treeListClusters, clustInclude )

# Collapse similar clusters
treeListClusters_collapse = collapseClusters( treeListClusters_filter, probeInfo )    

Test differential signal

# get total number of clusters
n_clusters = countClusters( treeListClusters_collapse )

# Evaluate Differential Correlation between two subsets of data
# do enough permutations to pass multiple testing cutoff
param = SnowParam(6, "SOCK", progressbar=TRUE)

resDiffCorr = evalDiffCorr( methylDataResiduals, info$sample_type, probeInfo, treeListClusters_collapse, BPPARAM=param, method="Box.permute", method.corr="spearman") 

# get summary of results
df = summary( resDiffCorr )
 
# print results
head(df)

Combine results to merge properties of each cluster into a single data.frame

df_results = combineResults( resDiffCorr, clstScore, treeListClusters, probeInfo)
## Summarizing analysis...
## Summarizing cluster properties...
## Collecting cluster locations...
## Summarizing each cluster...
## Merging results...
head(df_results[which(df_results$width > 10000),])
##      id chrom cluster        pValue       stat n.perm      p.adjust  N
## 48   50  chr5     308 1.944101e-110  -6.842322      0 3.924168e-107 14
## 70  100  chr6     206  1.692476e-97  -6.999268      0  2.342581e-94 14
## 71   30 chr19     184  4.037522e-97 -13.438510      0  5.509682e-94 33
## 99  100 chr19     182  3.686445e-88  12.377448      0  3.607801e-85 34
## 103 100 chr10      53  2.480567e-87  -5.154243      0  2.333371e-84  8
## 109 100  chr5     123  8.194951e-85  -6.329237      0  7.284334e-82 12
##     mean_abs_corr quantile75 quantile90 quantile95       LEF     start
## 48      0.2786953  0.4282844  0.6502067  0.8463104 0.2318176 150604417
## 70      0.3841944  0.5600910  0.8886664  0.9341039 0.2940485  73417252
## 71      0.3601287  0.5323323  0.6193424  0.6607998 0.2262820  11964786
## 99      0.2128440  0.3212171  0.4715395  0.5453922 0.1546122  56404510
## 103     0.3580671  0.8385230  0.9094727  0.9462639 0.3460754  26441767
## 109     0.6608204  0.9183263  0.9601502  0.9775305 0.4164269 132227402
##           end  width
## 48  150625357  20941
## 70   73452420  35169
## 71   12053170  88385
## 99   56567153 162644
## 103  26567526 125760
## 109 132253860  26459

Of the 96888 clusters tested, 47787 have a adjusted p-value < 0.05. Also, pi1=0.5074277.

Summary of cluster properties

# Histogram of LEF
ggplot(df_results, aes(LEF, fill=id)) + geom_histogram(alpha=0.7) + theme_bw(17) + xlim(0,1) + theme(aspect.ratio=1, legend.position="bottom", plot.title = element_text(hjust = 0.5)) + scale_fill_discrete(name = "Requested mean cluster size") + xlab("Lead eigenvalue fraction (LEF)") + ggtitle("Summarize LEF")

# Histogram of mean absolute correlation
ggplot(df_results, aes(mean_abs_corr, fill=id)) + geom_histogram(alpha=0.7) + theme_bw(17) + xlim(0,1) + theme(aspect.ratio=1, legend.position="bottom", plot.title = element_text(hjust = 0.5)) + scale_fill_discrete(name = "Requested mean cluster size") + xlab("Mean absolute correlation") + ggtitle("Summarize absolute correlation")

# Boxplot of number of features per cluster
df_results$id = factor(df_results$id, sort(as.numeric(unique(df_results$id))))
ggplot(df_results, aes(id, N, fill=id)) + geom_boxplot() + theme_bw(17) + theme(aspect.ratio=1, legend.position="bottom", plot.title = element_text(hjust = 0.5)) + scale_fill_discrete(name = "Feature per cluster") + xlab("Requested mean cluster size") + ylab("Number of features") + ggtitle("Summarize feature per cluster") + coord_flip()

Examine top cluster

idx = 1
# extract peak ID's from most significant cluster
probeIDs = getFeaturesInCluster( treeListClusters_collapse, df$chrom[idx], df$cluster[idx], df$id[idx])
    
main = paste(df$id[idx], df$chrom[idx], df$cluster[idx], sep=' ')

# get location of peaks in this cluster  
query = range(probeInfo[names(probeInfo) %in% probeIDs])

# expand window to include adjacent clusters
window = 1e5 
start(query) = start(query) - window
end(query) = end(query) + window
   
# Plot peaks in the query region
locText = with( data.frame(query), paste0(seqnames, ':', format(start, big.mark=','), '-', format(end, big.mark=',')))

Compare correlation structure along genome

First, plot correlation structure for controls (metadata$Disease==0). Then, plot correlation structure for cases (metadata$Disease==1).

The cluster located at chr11:2,600,569-2,801,310 has a p-value of 8.837e-254 and a test statistic of -26.2. The negative sign indicates a loss of correlation in the test set (i.e. Disease==1) compared to the baseline correlation in controls (i.e. Dissease==0)

# get location of peaks in this cluster  
query = range(probeInfo[names(probeInfo) %in% probeIDs])

# expand window to include adjacent clusters
window = 1e5  
start(query) = start(query) - window
end(query) = end(query) + window

fig1 = plotDecorate( ensdb, treeList, treeListClusters_collapse, probeInfo, query, data=methylDataResiduals[,which(info$sample_type == 'Solid Tissue Normal')])
## Computing correlation from given data...
fig2 = plotDecorate( ensdb, treeList, treeListClusters_collapse, probeInfo, query, data=methylDataResiduals[,which(info$sample_type=='Primary Tumor')])
## Computing correlation from given data...
plot_grid( fig1, fig2, ncol=2, labels=c('A: Solid Tissue Normal', 'B: Primary Tumor') )   

Pairwise scatter plots

ids = c('cg00674706', 'cg13641185')

C1 = cor(t(methylData[ids,info$sample_type==levels(info$sample_type)[1]]), method="sp")
C2 = cor(t(methylData[ids,info$sample_type==levels(info$sample_type)[2]]), method="sp")

# plot delta corr, vs magnatude
plotScatterPairs( methylData, ids, info$sample_type) + ggtitle(paste(ids, collapse=', '))

# compare magnitude between cases and controls
topTable(fitFull, coef="DiseasePrimary Tumor", number=Inf)[ids,]
##                   logFC    AveExpr            t   P.Value adj.P.Val
## cg00674706 -0.014023302 0.12916156 -0.251675777 0.8014089 0.9617538
## cg13641185  0.000296362 0.08493228  0.006518203 0.9948022 0.9993187
##                    B
## cg00674706 -7.266899
## cg13641185 -7.298475
idx = seq_len(min(20, length(probeIDs)))
plotScatterPairs( methylData, probeIDs[idx], info$sample_type) + ggtitle(main) 

Compare top cluster between cases and controls

# plot comparison of correlation matrices for peaks in probeIDs
#  where data is subset by metadata$Disease
main = paste0(df$chrom[1], ': cluster ', df$cluster[1])     
plotCompareCorr( methylDataResiduals, probeIDs, info$sample_type) + ggtitle(main)      

Highlight two significant regions

UBD

PFKFB3

ids = names(probeInfo[probeInfo %within% query])

C1 = cor(t(methylData[ids,info$sample_type==levels(info$sample_type)[1]]), method="sp")
C2 = cor(t(methylData[ids,info$sample_type==levels(info$sample_type)[2]]), method="sp")
D = abs(C1 - C2)

i = which.max(apply(D, 2, max))
# sort(D[names(i),])

ids = c('cg05686026', 'cg02572278')

plotScatterPairs( methylData, ids, info$sample_type) + ggtitle(main)

Y1 = t(methylData[ids,info$sample_type==levels(info$sample_type)[1]])
Y2 = t(methylData[ids,info$sample_type==levels(info$sample_type)[2]])

Y1 = data.frame(Y1)
Y2 = data.frame(Y2)

Y1$status = levels(info$sample_type)[1]
Y2$status = levels(info$sample_type)[2]

Y_combine = rbind(Y1, Y2)
Y_combine$status = factor(Y_combine$status, levels(info$sample_type))

library(gridExtra)
fig1 = ggplot(Y1, aes(cg05686026, cg02572278)) + geom_point(color="#00ff40") + theme_bw(17) + theme(aspect.ratio=1) + geom_smooth(method="lm", se=FALSE)
fig2 = ggplot(Y2, aes(cg05686026, cg02572278)) + geom_point(color="deepskyblue") + theme_bw(17) + theme(aspect.ratio=1) + geom_smooth(method="lm", se=FALSE, color='black')

fig3 = ggplot(Y_combine, aes(cg05686026, color=status)) + geom_density() + theme_bw(17) + theme(aspect.ratio=1, legend.position='none') + scale_color_manual(values=c("#00ff40", 'deepskyblue'))
fig4 = ggplot(Y_combine, aes(cg02572278, color=status)) + geom_density() + theme_bw(17) + theme(aspect.ratio=1, legend.position='none') + scale_color_manual(values=c("#00ff40", 'deepskyblue'))

grid.arrange(fig1, fig2,fig3, fig4, ncol=4)

# compare magnitude between cases and controls
topTable(fitFull, coef="DiseasePrimary Tumor", number=Inf)[ids,]
##                  logFC    AveExpr         t     P.Value adj.P.Val
## cg05686026  0.10692216 0.60294337  2.653837 0.008245814 0.1987988
## cg02572278 -0.01188886 0.01714413 -3.011320 0.002750636 0.1138163
##                    B
## cg05686026 -3.812806
## cg02572278 -2.820671

Save results

loc = '/sc/orga/projects/psychencode/gabriel/decorate_analysis/bed/'

# all features
rtracklayer::export.bed( probeInfo, paste0(loc, "KIRC_all.bed"))

# background - only features consider in tests
featureNames_background = getFeaturesInClusterList( treeListClusters_collapse, chrom=df_results$chrom, clustID=df_results$cluster, id=df_results$id)
featureNames_background = unique(unlist(featureNames_background))
rtracklayer::export.bed( probeInfo[featureNames_background], paste0(loc, "KIRC_background.bed"))

# get significant peaks
idx = which(df_results$p.adjust < 0.05)
featureNames_signif = getFeaturesInClusterList( treeListClusters_collapse, chrom=df_results$chrom[idx], clustID=df_results$cluster[idx], id=df_results$id[idx])
featureNames_signif = unique(unlist(featureNames_signif))
rtracklayer::export.bed( probeInfo[featureNames_signif], paste0(loc, "KIRC_signif.bed"))

# get significant peaks - UP
idx = which(df_results$p.adjust < 0.05 & df_results$stat > 0)
featureNames_signif = getFeaturesInClusterList( treeListClusters_collapse, chrom=df_results$chrom[idx], clustID=df_results$cluster[idx], id=df_results$id[idx])
featureNames_signif = unique(unlist(featureNames_signif))
rtracklayer::export.bed( probeInfo[featureNames_signif], paste0(loc, "KIRC_signif_up.bed"))

# get significant peaks - DOWN
idx = which(df_results$p.adjust < 0.05 & df_results$stat < 0)
featureNames_signif = getFeaturesInClusterList( treeListClusters_collapse, chrom=df_results$chrom[idx], clustID=df_results$cluster[idx], id=df_results$id[idx])
featureNames_signif = unique(unlist(featureNames_signif))
rtracklayer::export.bed( probeInfo[featureNames_signif], paste0(loc, "KIRC_signif_down.bed"))