The data files are to big (18Gb) to repost, but they can be find oun the TGCA Genomic Data Commons here
# load gene locations
# this is ENSEMBL v86 from the hg38 assembly
# but other versions and assemblies are available
library(EnsDb.Hsapiens.v86)
# Illumina Methylation 450K data
files = list.files(folder, pattern="*.gdc_hg38.txt.gz", recursive=TRUE, full.names=TRUE)
methl.list = lapply( files, function(file){
# read file
df = fread( file )
colnames(df)[colnames(df) == 'Composite Element REF'] = 'name'
# reduce memory usage by dropping colums
if( file != files[1] ){
df = df[,1:2]
}
# get sample id
id = gsub(".gdc_hg38.txt.gz", "", basename(file))
id = unlist(strsplit(id, '\\.'))
# get folder
fldr = unlist(strsplit(dirname(file), '/'))
# Set sample id and folder name
attr(df, "folder") = fldr[length(fldr)]
attr(df, "sampleID") = id[length(id)]
df
})
# merge file sinto single matrix
methylData = Reduce(function(a, b){
merge(a, b, by = "name", all.x = TRUE, all.y = TRUE)
},
lapply(methl.list, function(x){
a = x[,c('name', 'Beta_value')]
colnames(a)[2] = attr(a, "sampleID")
a
} )
)
probeNames = methylData$name
methylData[,name:=NULL]
methylData = as.data.frame(methylData)
rownames(methylData) = probeNames
# get probe info as GRanges
probeInfo = with(methl.list[[1]], GRanges(Chromosome, IRanges(Start, End, names = name), Feature_Type=Feature_Type, Gene_Symbol=Gene_Symbol, Position_to_TSS=Position_to_TSS))
# Set methylData order to same as probeInfo
idx = match(names(probeInfo), rownames(methylData))
methylData = methylData[idx,]
# idx = match(probeInfo$name, rownames(methylData))
# is.unsorted(idx)
# any(is.na(idx))
# reorder by chromosome position
ord = order(probeInfo)
probeInfo = probeInfo[ord]
methylData = methylData[ord,]
# mean imputation of probes with less than 10 missing samples
countNA = apply(methylData, 1, function(x) sum(is.na(x)))
mu = apply(methylData, 1, function(x) mean(x, na.rm=TRUE))
for(idx in which(countNA > 0 & countNA < 10)){
# cat("\r", idx, ' ')
# methylData[idx,][is.na(methylData[idx,])] = mu[idx]
value = methylData[idx,]
methylData[idx,is.na(value)] = mu[idx]
}
# filter out probes with NA files
methylData = methylData[which(countNA < 10),]
probeInfo = probeInfo[which(countNA < 10)]
# probe location must be valid
idx = which(start(probeInfo) > 1)
methylData = methylData[idx,]
probeInfo = probeInfo[idx]
# drop seqnames with no entries
chrNameCount = table(seqnames(probeInfo))
probeInfo = dropSeqlevels( probeInfo, names(chrNameCount[chrNameCount==0]))
# get shorter sample names
colnames(methylData) = sapply(strsplit(colnames(methylData), '-'), function(x) paste(x[1:4], collapse='-'))
# sample metadata
info = fread(paste0(folder, "/metadata/sample.tsv"), sep="\t")
# table(info$sample_type)
info = info[sample_type %in% c('Primary Tumor', 'Solid Tissue Normal') & !is.na(sample_type),]
info$sample_type = factor(info$sample_type , c('Solid Tissue Normal', 'Primary Tumor'))
# any(is.na(info$sample_type))
# reorder info based on methylData
ids = intersect(colnames(methylData), info$sample_submitter_id)
info = droplevels(info[match(ids, sample_submitter_id),])
methylData = methylData[,match(ids, colnames(methylData)),]
# identical( info$sample_submitter_id, colnames(methylData))
# Compute PCA on original dataset
dcmp = svd( t(scale(t(methylData))), nu=2, nv=40)
colnames(dcmp$v) = paste0("PC", 1:ncol(dcmp$v))
# Compute residuals after accounting for PC's
# and add back disease effect
design = model.matrix(~ ., data.frame(Disease = info$sample_type, dcmp$v))
fitFull = lmFit( methylData, design)
fitFull = eBayes(fitFull)
methylDataResiduals = residuals( fitFull, methylData) + coef(fitFull)[,'DiseasePrimary Tumor', drop=FALSE] %*% t(design[,'DiseasePrimary Tumor',drop=FALSE] )
design = model.matrix(~ ., data.frame(Disease = info$sample_type))
fitTest = lmFit( methylData, design)
fitTest = eBayes(fitTest)
df = data.frame(info, dcmp$v[,1:20])
ggplot(df, aes(PC1, PC2, color=sample_type)) + geom_point() + theme_bw(17) + theme(aspect.ratio=1, plot.title = element_text(hjust = 0.5)) + ggtitle("Original data")
dcmpResid = svd( t(scale(t(methylDataResiduals))), nu=2, nv=2)
colnames(dcmpResid$v) = paste0("PC", 1:ncol(dcmpResid$v))
# df = data.frame(info, dcmpResid$v[,1:2])
# ggplot(df, aes(PC1, PC2, color=sample_type)) + geom_point() + theme_bw(17) + theme(aspect.ratio=1, plot.title = element_text(hjust = 0.5)) + ggtitle("Residuals")
# rm(dcmpResid)
# Compute correlation and hierarchical clustering on controls
treeList = runOrderedClusteringGenome( methylDataResiduals[,which(info$sample_type=='Primary Tumor')], probeInfo, method.corr="spearman", quiet=TRUE )
##
Evaluating: chr16
Evaluating: chr3
Evaluating: chr1
Evaluating: chr8
Evaluating: chr14
Evaluating: chr15
Evaluating: chr9
Evaluating: chr19
Evaluating: chr6
Evaluating: chr12
Evaluating: chr2
Evaluating: chr4
Evaluating: chr11
Evaluating: chr20
Evaluating: chr10
Evaluating: chr18
Evaluating: chr21
Evaluating: chr17
Evaluating: chr7
Evaluating: chr22
Evaluating: chrX
Evaluating: chr13
Evaluating: chr5
Evaluating: chrY
dfDist = evaluateCorrDecay( treeList, probeInfo, "chr22", verbose=FALSE )
plotCorrDecay( dfDist, outlierQuantile=1e-5 )
treeListClusters = createClusters( treeList, method='meanClusterSize', meanClusterSize=c( 5, 10, 20, 30, 40, 50, 100, 200, 500) )
## Method: meanClusterSize
# get total number of clusters
n_clusters = countClusters( treeListClusters )
# score each cluster to only retain
# clusters with strong correlation structure
clstScore = scoreClusters(treeList, treeListClusters )
## Evaluating strength of each cluster...
##
## Dividing work into 216 chunks...
df_LEF = do.call('rbind', clstScore )
df_LEF$id = factor(df_LEF$id, sort(unique(df_LEF$id)))
ggplot(df_LEF, aes(LEF, color=id)) + geom_density() + theme_bw(17) + theme(aspect.ratio=1)
ggplot(df_LEF, aes(N, LEF, color=id)) + geom_point() + geom_smooth(se=FALSE) + theme_bw(17) + theme(aspect.ratio=1) + xlim(0, 100)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# retain clusters based on filtering criteria
# LEF is the fraction of variance explained by the first eigen-value
clustInclude = retainClusters( clstScore, "LEF", .05 )
# get retained clusters
treeListClusters_filter = filterClusters( treeListClusters, clustInclude )
# Collapse similar clusters
treeListClusters_collapse = collapseClusters( treeListClusters_filter, probeInfo )
# get total number of clusters
n_clusters = countClusters( treeListClusters_collapse )
# Evaluate Differential Correlation between two subsets of data
# do enough permutations to pass multiple testing cutoff
param = SnowParam(6, "SOCK", progressbar=TRUE)
resDiffCorr = evalDiffCorr( methylDataResiduals, info$sample_type, probeInfo, treeListClusters_collapse, BPPARAM=param, method="Box.permute", method.corr="spearman")
# get summary of results
df = summary( resDiffCorr )
# print results
head(df)
Combine results to merge properties of each cluster into a single data.frame
df_results = combineResults( resDiffCorr, clstScore, treeListClusters, probeInfo)
## Summarizing analysis...
## Summarizing cluster properties...
## Collecting cluster locations...
## Summarizing each cluster...
## Merging results...
head(df_results[which(df_results$width > 10000),])
## id chrom cluster pValue stat n.perm p.adjust N
## 48 50 chr5 308 1.944101e-110 -6.842322 0 3.924168e-107 14
## 70 100 chr6 206 1.692476e-97 -6.999268 0 2.342581e-94 14
## 71 30 chr19 184 4.037522e-97 -13.438510 0 5.509682e-94 33
## 99 100 chr19 182 3.686445e-88 12.377448 0 3.607801e-85 34
## 103 100 chr10 53 2.480567e-87 -5.154243 0 2.333371e-84 8
## 109 100 chr5 123 8.194951e-85 -6.329237 0 7.284334e-82 12
## mean_abs_corr quantile75 quantile90 quantile95 LEF start
## 48 0.2786953 0.4282844 0.6502067 0.8463104 0.2318176 150604417
## 70 0.3841944 0.5600910 0.8886664 0.9341039 0.2940485 73417252
## 71 0.3601287 0.5323323 0.6193424 0.6607998 0.2262820 11964786
## 99 0.2128440 0.3212171 0.4715395 0.5453922 0.1546122 56404510
## 103 0.3580671 0.8385230 0.9094727 0.9462639 0.3460754 26441767
## 109 0.6608204 0.9183263 0.9601502 0.9775305 0.4164269 132227402
## end width
## 48 150625357 20941
## 70 73452420 35169
## 71 12053170 88385
## 99 56567153 162644
## 103 26567526 125760
## 109 132253860 26459
Of the 96888 clusters tested, 47787 have a adjusted p-value < 0.05. Also, pi1=0.5074277.
# Histogram of LEF
ggplot(df_results, aes(LEF, fill=id)) + geom_histogram(alpha=0.7) + theme_bw(17) + xlim(0,1) + theme(aspect.ratio=1, legend.position="bottom", plot.title = element_text(hjust = 0.5)) + scale_fill_discrete(name = "Requested mean cluster size") + xlab("Lead eigenvalue fraction (LEF)") + ggtitle("Summarize LEF")
# Histogram of mean absolute correlation
ggplot(df_results, aes(mean_abs_corr, fill=id)) + geom_histogram(alpha=0.7) + theme_bw(17) + xlim(0,1) + theme(aspect.ratio=1, legend.position="bottom", plot.title = element_text(hjust = 0.5)) + scale_fill_discrete(name = "Requested mean cluster size") + xlab("Mean absolute correlation") + ggtitle("Summarize absolute correlation")
# Boxplot of number of features per cluster
df_results$id = factor(df_results$id, sort(as.numeric(unique(df_results$id))))
ggplot(df_results, aes(id, N, fill=id)) + geom_boxplot() + theme_bw(17) + theme(aspect.ratio=1, legend.position="bottom", plot.title = element_text(hjust = 0.5)) + scale_fill_discrete(name = "Feature per cluster") + xlab("Requested mean cluster size") + ylab("Number of features") + ggtitle("Summarize feature per cluster") + coord_flip()
idx = 1
# extract peak ID's from most significant cluster
probeIDs = getFeaturesInCluster( treeListClusters_collapse, df$chrom[idx], df$cluster[idx], df$id[idx])
main = paste(df$id[idx], df$chrom[idx], df$cluster[idx], sep=' ')
# get location of peaks in this cluster
query = range(probeInfo[names(probeInfo) %in% probeIDs])
# expand window to include adjacent clusters
window = 1e5
start(query) = start(query) - window
end(query) = end(query) + window
# Plot peaks in the query region
locText = with( data.frame(query), paste0(seqnames, ':', format(start, big.mark=','), '-', format(end, big.mark=',')))
First, plot correlation structure for controls (metadata$Disease==0). Then, plot correlation structure for cases (metadata$Disease==1).
The cluster located at chr11:2,600,569-2,801,310 has a p-value of 8.837e-254 and a test statistic of -26.2. The negative sign indicates a loss of correlation in the test set (i.e. Disease==1) compared to the baseline correlation in controls (i.e. Dissease==0)
# get location of peaks in this cluster
query = range(probeInfo[names(probeInfo) %in% probeIDs])
# expand window to include adjacent clusters
window = 1e5
start(query) = start(query) - window
end(query) = end(query) + window
fig1 = plotDecorate( ensdb, treeList, treeListClusters_collapse, probeInfo, query, data=methylDataResiduals[,which(info$sample_type == 'Solid Tissue Normal')])
## Computing correlation from given data...
fig2 = plotDecorate( ensdb, treeList, treeListClusters_collapse, probeInfo, query, data=methylDataResiduals[,which(info$sample_type=='Primary Tumor')])
## Computing correlation from given data...
plot_grid( fig1, fig2, ncol=2, labels=c('A: Solid Tissue Normal', 'B: Primary Tumor') )
ids = c('cg00674706', 'cg13641185')
C1 = cor(t(methylData[ids,info$sample_type==levels(info$sample_type)[1]]), method="sp")
C2 = cor(t(methylData[ids,info$sample_type==levels(info$sample_type)[2]]), method="sp")
# plot delta corr, vs magnatude
plotScatterPairs( methylData, ids, info$sample_type) + ggtitle(paste(ids, collapse=', '))
# compare magnitude between cases and controls
topTable(fitFull, coef="DiseasePrimary Tumor", number=Inf)[ids,]
## logFC AveExpr t P.Value adj.P.Val
## cg00674706 -0.014023302 0.12916156 -0.251675777 0.8014089 0.9617538
## cg13641185 0.000296362 0.08493228 0.006518203 0.9948022 0.9993187
## B
## cg00674706 -7.266899
## cg13641185 -7.298475
idx = seq_len(min(20, length(probeIDs)))
plotScatterPairs( methylData, probeIDs[idx], info$sample_type) + ggtitle(main)
# plot comparison of correlation matrices for peaks in probeIDs
# where data is subset by metadata$Disease
main = paste0(df$chrom[1], ': cluster ', df$cluster[1])
plotCompareCorr( methylDataResiduals, probeIDs, info$sample_type) + ggtitle(main)
ids = names(probeInfo[probeInfo %within% query])
C1 = cor(t(methylData[ids,info$sample_type==levels(info$sample_type)[1]]), method="sp")
C2 = cor(t(methylData[ids,info$sample_type==levels(info$sample_type)[2]]), method="sp")
D = abs(C1 - C2)
i = which.max(apply(D, 2, max))
# sort(D[names(i),])
ids = c('cg05686026', 'cg02572278')
plotScatterPairs( methylData, ids, info$sample_type) + ggtitle(main)
Y1 = t(methylData[ids,info$sample_type==levels(info$sample_type)[1]])
Y2 = t(methylData[ids,info$sample_type==levels(info$sample_type)[2]])
Y1 = data.frame(Y1)
Y2 = data.frame(Y2)
Y1$status = levels(info$sample_type)[1]
Y2$status = levels(info$sample_type)[2]
Y_combine = rbind(Y1, Y2)
Y_combine$status = factor(Y_combine$status, levels(info$sample_type))
library(gridExtra)
fig1 = ggplot(Y1, aes(cg05686026, cg02572278)) + geom_point(color="#00ff40") + theme_bw(17) + theme(aspect.ratio=1) + geom_smooth(method="lm", se=FALSE)
fig2 = ggplot(Y2, aes(cg05686026, cg02572278)) + geom_point(color="deepskyblue") + theme_bw(17) + theme(aspect.ratio=1) + geom_smooth(method="lm", se=FALSE, color='black')
fig3 = ggplot(Y_combine, aes(cg05686026, color=status)) + geom_density() + theme_bw(17) + theme(aspect.ratio=1, legend.position='none') + scale_color_manual(values=c("#00ff40", 'deepskyblue'))
fig4 = ggplot(Y_combine, aes(cg02572278, color=status)) + geom_density() + theme_bw(17) + theme(aspect.ratio=1, legend.position='none') + scale_color_manual(values=c("#00ff40", 'deepskyblue'))
grid.arrange(fig1, fig2,fig3, fig4, ncol=4)
# compare magnitude between cases and controls
topTable(fitFull, coef="DiseasePrimary Tumor", number=Inf)[ids,]
## logFC AveExpr t P.Value adj.P.Val
## cg05686026 0.10692216 0.60294337 2.653837 0.008245814 0.1987988
## cg02572278 -0.01188886 0.01714413 -3.011320 0.002750636 0.1138163
## B
## cg05686026 -3.812806
## cg02572278 -2.820671
loc = '/sc/orga/projects/psychencode/gabriel/decorate_analysis/bed/'
# all features
rtracklayer::export.bed( probeInfo, paste0(loc, "KIRC_all.bed"))
# background - only features consider in tests
featureNames_background = getFeaturesInClusterList( treeListClusters_collapse, chrom=df_results$chrom, clustID=df_results$cluster, id=df_results$id)
featureNames_background = unique(unlist(featureNames_background))
rtracklayer::export.bed( probeInfo[featureNames_background], paste0(loc, "KIRC_background.bed"))
# get significant peaks
idx = which(df_results$p.adjust < 0.05)
featureNames_signif = getFeaturesInClusterList( treeListClusters_collapse, chrom=df_results$chrom[idx], clustID=df_results$cluster[idx], id=df_results$id[idx])
featureNames_signif = unique(unlist(featureNames_signif))
rtracklayer::export.bed( probeInfo[featureNames_signif], paste0(loc, "KIRC_signif.bed"))
# get significant peaks - UP
idx = which(df_results$p.adjust < 0.05 & df_results$stat > 0)
featureNames_signif = getFeaturesInClusterList( treeListClusters_collapse, chrom=df_results$chrom[idx], clustID=df_results$cluster[idx], id=df_results$id[idx])
featureNames_signif = unique(unlist(featureNames_signif))
rtracklayer::export.bed( probeInfo[featureNames_signif], paste0(loc, "KIRC_signif_up.bed"))
# get significant peaks - DOWN
idx = which(df_results$p.adjust < 0.05 & df_results$stat < 0)
featureNames_signif = getFeaturesInClusterList( treeListClusters_collapse, chrom=df_results$chrom[idx], clustID=df_results$cluster[idx], id=df_results$id[idx])
featureNames_signif = unique(unlist(featureNames_signif))
rtracklayer::export.bed( probeInfo[featureNames_signif], paste0(loc, "KIRC_signif_down.bed"))