Raw data and processing scripts are available from Jaffe et al. here. This workflow starts with the normalized, QC'd data that I posted here.
suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(decorate))
suppressPackageStartupMessages(library(GenomicRanges))
suppressPackageStartupMessages(library(limma))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(doParallel))
suppressPackageStartupMessages(library(BiocParallel))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(qvalue))
# ggplot2 is used directly for the summary plots below
suppressPackageStartupMessages(library(ggplot2))
options(xtable.type="html")
knitr::opts_chunk$set(
echo=TRUE,
warning=FALSE,
message=TRUE,
error = FALSE,
tidy = FALSE,
cache = TRUE,
cache.lazy = FALSE,
dev = c("png", "pdf"),
fig.width=7, fig.height=7)
options(markdown.HTML.stylesheet = 'css/custom.css')
register(SnowParam(4, "SOCK", progressbar=TRUE))
# load gene locations
# this is ENSEMBL v75 from the hg19 assembly
# but other versions and assemblies are available
library(EnsDb.Hsapiens.v75)
## Loading required package: ensembldb
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
## Loading required package: AnnotationFilter
##
## Attaching package: 'ensembldb'
## The following object is masked from 'package:stats':
##
## filter
suppressPackageStartupMessages(library(synapser))
# login once and then save info
# synLogin("user.name", "password", rememberMe=TRUE)
synLogin()
## Welcome, Gabriel Hoffman!
## NULL
response = readRDS( synGet('syn20781704')$path )
metadata = readRDS( synGet('syn20781702')$path )
featureLocation = readRDS( synGet('syn20781701')$path )
# features must have a width of at least 2
end(featureLocation) = end(featureLocation) + 1
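As a quick sanity check (not part of the original workflow), the adjusted widths can be verified directly with GenomicRanges:
# all features should now have a width of at least 2
stopifnot( all( width(featureLocation) >= 2 ) )
summary( width(featureLocation) )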
# Disease must be factor
metadata$Dx = factor(metadata$Dx, c("Control", "Schizo"))
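Because "Control" is the first level, it is treated as the reference, which is why the limma coefficient below is named DxSchizo. A quick check of the levels and group sizes:
# confirm factor levels and group sizes
levels( metadata$Dx )
table( metadata$Dx )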
## Get residuals for decorate
design = model.matrix(~ Dx + Age + Race + negControl_PC1 + negControl_PC2 + negControl_PC3 + negControl_PC4, metadata)
fit = lmFit( response, design)
fit = eBayes( fit )
# # residuals + Dx
# residValues = residuals( fit, response) + coef(fit)[,'DxSchizo']
residValues = residuals( fit, response)
topTable(fit, coef='DxSchizo')
##                   logFC    AveExpr         t      P.Value    adj.P.Val        B
## cg14703058 -0.010197456 0.06554729 -7.660376 4.638158e-13 1.427984e-07 18.06054
## cg23959187 -0.018667860 0.10868210 -7.555010 8.935195e-13 1.427984e-07 17.41339
## cg26168557 -0.014126185 0.06012364 -7.468763 1.522234e-12 1.427984e-07 16.88776
## cg16017358 -0.016918626 0.07141060 -7.457166 1.634837e-12 1.427984e-07 16.81737
## cg01949938 -0.014953909 0.10922211 -7.451140 1.696555e-12 1.427984e-07 16.78082
## cg14890726 -0.010445414 0.11258789 -7.434705 1.876815e-12 1.427984e-07 16.68122
## cg08376848 -0.013141435 0.10207105 -7.376129 2.686891e-12 1.544102e-07 16.32739
## cg03091589 -0.007599084 0.08783086 -7.374975 2.705907e-12 1.544102e-07 16.32043
## cg10471794 -0.019300262 0.11428773 -7.322782 3.719888e-12 1.713379e-07 16.00667
## cg01244871 -0.012484809 0.07251744 -7.321318 3.753187e-12 1.713379e-07 15.99788
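topTable() only shows the top 10 probes. To count how many probes pass a 5% FDR genome-wide, the full table can be pulled (an optional extra step):
# full differential methylation results for the Dx coefficient
tab = topTable( fit, coef='DxSchizo', number=Inf )
# number of probes with adjusted p-value < 0.05
sum( tab$adj.P.Val < 0.05 )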
# Compute correlation and hierarchical clustering on controls
treeList = runOrderedClusteringGenome( residValues[,metadata$Dx=='Control'], featureLocation, method.corr="spearman" )
## Evaluating: chr1 through chr22
# decay in original data
treeListOriginal = runOrderedClusteringGenome( response[,metadata$Dx=='Control'], featureLocation, method.corr="spearman" )
## Evaluating: chr1 through chr22
dfDistOriginal = evaluateCorrDecay( treeListOriginal, featureLocation, "chr22" )
plotCorrDecay( dfDistOriginal, method="R", xlim=c(500, 1e6), outlierQuantile=1e-5 )
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# decay in residuals
dfDist = evaluateCorrDecay( treeList, featureLocation, "chr22" )
plotCorrDecay( dfDist, method="R", xlim=c(500, 1e6), outlierQuantile=1e-5 )
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
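Since cowplot is already loaded, the two decay curves can be shown side by side for easier comparison. This is an optional sketch that assumes plotCorrDecay() returns a ggplot object; it simply repeats the calls above and stores the results:
# compare correlation decay in the original data and in the residuals
figOrig = plotCorrDecay( dfDistOriginal, method="R", xlim=c(500, 1e6), outlierQuantile=1e-5 )
figResid = plotCorrDecay( dfDist, method="R", xlim=c(500, 1e6), outlierQuantile=1e-5 )
plot_grid( figOrig, figResid, ncol=2, labels=c("Original", "Residuals") )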
treeListClusters = createClusters( treeList, method='meanClusterSize', meanClusterSize=c( 10, 20, 30, 40, 50, 100, 200, 500) )
## Method: meanClusterSize
# get total number of clusters
n_clusters = countClusters( treeListClusters )
# score each cluster to only retain
# clusters with strong correlation structure
clstScore = scoreClusters(treeList, treeListClusters )
## Evaluating strength of each cluster...
##
## Dividing work into 176 chunks...
df_LEF = do.call('rbind', clstScore )
df_LEF$id = factor(df_LEF$id, sort(unique(df_LEF$id)))
ggplot(df_LEF, aes(LEF, color=id)) + geom_density() + theme_bw(17) + theme(aspect.ratio=1)
# retain clusters based on filtering criteria
# keep clusters with a lead eigenvalue fraction (LEF) > 0.05
# LEF is the fraction of variance explained by the first eigenvalue
clustInclude = retainClusters( clstScore, "LEF", 0.05 )
## Using cutoffs:
## Cluster set cutoff
## 10 0.05
## 20 0.05
## 30 0.05
## 40 0.05
## 50 0.05
## 100 0.05
## 200 0.05
## 500 0.05
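Before committing to the 0.05 cutoff, it can help to look at the LEF distribution directly. A simple base-R summary (not part of the original workflow), broken down by requested cluster size:
# quantiles of LEF for each requested mean cluster size
tapply( df_LEF$LEF, df_LEF$id, quantile, probs=c(0.05, 0.25, 0.5, 0.75, 0.95) )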
# get retained clusters
treeListClusters_filter = filterClusters( treeListClusters, clustInclude )
# Collapse similar clusters
treeListClusters_collapse = collapseClusters( treeListClusters_filter, featureLocation )
## Identifying redundant clusters...
# get total number of clusters
n_clusters = countClusters( treeListClusters_collapse )
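To see how much the filtering and collapsing steps reduced the cluster set, the counts at each stage can be compared (a small bookkeeping check, not in the original):
countClusters( treeListClusters )          # all clusters
countClusters( treeListClusters_filter )   # after the LEF filter
countClusters( treeListClusters_collapse ) # after collapsing redundant clusters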
# Evaluate differential correlation between the Control and Schizo groups
# npermute is not defined earlier in this workflow; set it here (example value, adjust as needed)
npermute = 100
ecdBox = evalDiffCorr( residValues, metadata$Dx, featureLocation, treeListClusters_collapse, npermute, method = "Box.permute", method.corr="spearman")
## Note that clusters of 2 or fewer features are omitted from analysis
##
## # Clusters: 61823
## Dividing work into 100 chunks...
## Combining results...
# Analysis with deltaSLE
# ecdSLE = evalDiffCorr( residValues, metadata$Dx, featureLocation, treeListClusters_collapse, npermute, method = "deltaSLE", method.corr="spearman")
# get summary of results
df = summary( ecdBox )
# print results
head(df)
## id chrom cluster pValue stat n.perm p.adjust
## 1 10 chr13 1095 2.692875e-09 4.4348007 0 0.0001541455
## 2 10 chr17 1726 1.792182e-08 0.5843344 0 0.0003907178
## 3 10 chr13 1141 2.047716e-08 0.8910091 0 0.0003907178
## 4 10 chr8 1964 7.578456e-08 2.1475903 0 0.0010400013
## 5 10 chr1 3562 9.084251e-08 -0.0613200 0 0.0010400013
## 6 10 chr2 1280 3.344588e-07 -0.8886028 0 0.0031908480
Combine the results to merge the properties of each cluster into a single data.frame.
df_results = combineResults( ecdBox, clstScore, treeListClusters, featureLocation)
## Summarizing analysis...
## Summarizing cluster properties...
## Collecting cluster locations...
## Summarizing each cluster...
## Merging results...
head(df_results)
## id chrom cluster pValue stat n.perm p.adjust N
## 1 10 chr13 1095 2.980768e-09 4.4348007 0 0.0001706251 9
## 2 10 chr17 1726 1.836351e-08 0.5843344 0 0.0003907178 4
## 3 10 chr13 1141 2.047716e-08 0.8910091 0 0.0003907178 2
## 4 10 chr8 1964 4.981476e-08 2.1475903 0 0.0005957367 5
## 5 10 chr1 3562 5.203668e-08 -0.0613200 0 0.0005957367 3
## 6 10 chr11 1362 2.125363e-07 1.4246073 0 0.0020276667 7
## mean_abs_corr quantile75 quantile90 quantile95 LEF start
## 1 0.7749821 0.8199156 0.8839297 0.9011426 0.4560792 114202067
## 2 0.5241504 0.5590907 0.5946782 0.6090513 0.4470460 60142891
## 3 0.8367778 0.8367778 0.8367778 0.8367778 0.7091945 114812177
## 4 0.8240618 0.8510913 0.8816569 0.8881425 0.5299607 145654565
## 5 0.5160377 0.5682601 0.6118153 0.6263337 0.5063517 183604674
## 6 0.6526479 0.7309050 0.7917227 0.8413434 0.4238364 63821400
## end width
## 1 114204080 2014
## 2 60143447 557
## 3 114812185 9
## 4 145654855 291
## 5 183604790 117
## 6 63828229 6830
Of the 57242 clusters tested, 49 have an adjusted p-value < 0.05. Also, pi1 = 0.
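These numbers can be computed from df_results using the qvalue package loaded above; pi1 is one minus the estimated proportion of null tests (a sketch):
# number of clusters tested and number passing 5% FDR
nrow( df_results )
sum( df_results$p.adjust < 0.05, na.rm=TRUE )
# pi1: estimated fraction of non-null tests
1 - qvalue( na.omit(df_results$pValue) )$pi0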
# Histogram of LEF
ggplot(df_results, aes(LEF, fill=id)) + geom_histogram(alpha=0.7) +
theme_bw(17) + xlim(0,1) +
theme(aspect.ratio=1, legend.position="bottom",
plot.title = element_text(hjust = 0.5)) +
scale_fill_discrete(name = "Requested mean cluster size") +
xlab("Lead eigenvalue fraction (LEF)") +
ggtitle("Summarize LEF")
# Histogram of mean absolute correlation
ggplot(df_results, aes(mean_abs_corr, fill=id)) + geom_histogram(alpha=0.7) + theme_bw(17) + xlim(0,1) +
theme(aspect.ratio=1, legend.position="bottom",
plot.title = element_text(hjust = 0.5)) +
scale_fill_discrete(name = "Requested mean cluster size") +
xlab("Mean absolute correlation") +
ggtitle("Summarize absolute correlation")
# Boxplot of number of features per cluster
df_results$id = factor(df_results$id, sort(as.numeric(unique(df_results$id))))
ggplot(df_results, aes(id, N, fill=id)) +
geom_boxplot() + theme_bw(17) +
theme(aspect.ratio=1, legend.position="bottom",
plot.title = element_text(hjust = 0.5)) +
scale_fill_discrete(name = "Feature per cluster") +
xlab("Requested mean cluster size") +
ylab("Number of features") +
ggtitle("Summarize feature per cluster") +
coord_flip()
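Finally, the correlation structure of the top differentially correlated cluster can be examined in its genomic context. This sketch follows the pattern from the decorate documentation and uses the EnsDb.Hsapiens.v75 annotation loaded above; it assumes featureLocation is named by probe ID, and the argument order of getFeaturesInCluster() and plotDecorate() may differ between package versions, so check the help pages before running.
# features in the top-ranked cluster from df_results
featIDs = getFeaturesInCluster( treeListClusters_collapse, df_results$chrom[1], df_results$cluster[1], df_results$id[1] )
# genomic window spanning those features
query = range( featureLocation[featIDs] )
# plot gene models and the correlation structure of clusters in this window
plotDecorate( EnsDb.Hsapiens.v75, treeList, treeListClusters_collapse, featureLocation, query )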