In this package, we provide e-value for four DMR (differentially methylated region) detection tools (MethylKit, Metilene, BiSeq and DMRfinder) and general purpose.
methylKit is a R package for DNA methylation analysis and annotation from high-throughput bisulfite sequencing. The package is designed to deal with sequencing data from RRBS and its variants, but also target-capture methods and whole genome bisulfite sequencing.
Currently, evalue
package supports the e-value analytics
of the methylKit
output file.
library(metevalue)
####Simulation Data ####
set.seed(1234)
# methylKit is a Bioconductor package
library(methylKit)
=list( system.file("extdata",
file.list"test1.myCpG.txt", package = "methylKit"),
system.file("extdata",
"test2.myCpG.txt", package = "methylKit"),
system.file("extdata",
"control1.myCpG.txt", package = "methylKit"),
system.file("extdata",
"control2.myCpG.txt", package = "methylKit") )
# read the files to a methylRawList object: myobj
=methRead(file.list,
myobjsample.id=list("test1","test2","ctrl1","ctrl2"),
assembly="hg18",
treatment=c(1,1,0,0),
context="CpG"
)
=unite(myobj, destrand=FALSE)
meth<- getData(meth)[,seq(6,ncol(meth),3)]
meth.C <- getData(meth)[,seq(7,ncol(meth),3)]
meth.T <- meth.C/(meth.C + meth.T)
mr = getData(meth)[,1:2]
chr_pos = data.frame(chr_pos,mr)
methyrate names(methyrate) = c('chr', 'pos', rep('g1',2), rep('g2',2))
<-tileMethylCounts(myobj)
region<-unite(region,destrand=F)
meth<-calculateDiffMeth(meth)
myDiff<-getMethylDiff(myDiff,type="all")
met_all
= tempfile(c("rate_combine", "methylKit_DMR_raw"))
example_tempfiles tempdir()
write.table(methyrate, file=example_tempfiles[1], row.names=F, col.names=T, quote=F, sep='\t')
write.table (met_all, file=example_tempfiles[2], sep ="\t", row.names =F, col.names =T, quote =F)
evalue.methylKit
function could be used to tackle the
problem.
= metevalue.methylKit(example_tempfiles[1], example_tempfiles[2], bheader = T)
result str(result)
Alternatively, one could use the build-in functions to derive functions which avoid the file operation:
= evalue_buildin_var_fmt_nm(methyrate, met_all, method="methylKit")
result = list(a = result$a,
result b = result$b,
a_b = evalue_buildin_sql(result$a, result$b, method="methylKit"))
= varevalue.metilene(result$a, result$b, result$a_b)
result str(result)
First, we load the methylation data at CpG site levels from ‘BiSeq’ package. Then we clustered CpG sites into DMRs using ‘BiSeq’.
library(BiSeq)
library(dplyr)
data(rrbs)
<- rawToRel(rrbs)
rrbs.rel <- methLevel(rrbs.rel)
methyrate <- data.frame(methyrate)
methyrate = cbind(rows = as.numeric(row.names(methyrate)), methyrate)
methyrateq = data.frame(rows = as.numeric(row.names(methyrate)), rowRanges(rrbs))
methypos = left_join(methypos, methyrateq)
methyrate = methyrate[,c(2,3,7:16)]
methyrate names(methyrate) <- c('chr','pos',rep('g1',5),rep('g2',5))
<- clusterSites(object = rrbs,perc.samples = 3/4,min.sites = 20,max.dist = 100)
rrbs.clust.unlim
clusterSitesToGR(rrbs.clust.unlim)
<- totalReads(rrbs.clust.unlim) > 0
ind.cov
<- quantile(totalReads(rrbs.clust.unlim)[ind.cov])
quant <- limitCov(rrbs.clust.unlim, maxCov = quant)
rrbs.clust.lim <- predictMeth(object = rrbs.clust.lim)
predictedMeth
<- predictedMeth[, colData(predictedMeth)$group == "test"]
test<- predictedMeth[, colData(predictedMeth)$group == "control"]
control <- rowMeans(methLevel(test))
mean.test <- rowMeans(methLevel(control))
mean.control
<- betaRegression(formula = ~group,link = "probit",object = predictedMeth,type = "BR")
betaResults <- makeVariogram(betaResults)
vario <- smoothVariogram(vario, sill = 0.9)
vario.sm
<- estLocCor(vario.sm)
locCor <- testClusters(locCor)
clusters.rej <- trimClusters(clusters.rej)
clusters.trimmed <- findDMRs(clusters.trimmed,max.dist = 100,diff.dir = TRUE)
DMRs
= tempfile(c('rate_combine', 'BiSeq_DMR'))
example_tempfiles write.table(methyrate, example_tempfiles[1], row.names=F, col.names=T, quote=F, sep='\t')
write.table(DMRs, example_tempfiles[2], quote=F, row.names = F,col.names = F, sep = '\t')
Finally, we added E-values and adjusted E-values as additional
columns to the output file of ‘BiSeq’.BiSeq_evalue
function
could be used to tackle the problem.
<- data.frame(DMRs)
hh = evalue_buildin_var_fmt_nm(rate_combine, hh, method="biseq")
result = list(a = result$a, b = result$b,
result a_b = evalue_buildin_sql(result$a, result$b,method="biseq"))
= varevalue.metilene(result$a, result$b, result$a_b)
result str(result)
Given the input file
rate_combine_DMRfinder
: a file containing
methylation rates at each CpG site
DMRfinder_DMR
: the output file from
‘DMRfinder’
<- read.table("rate_combine_DMRfinder", header = T)
rate_combine head(rate_combine)
<- read.table("DMRfinder_DMR", header = T)
DMRs head(DMRs)
Adding E-values and adjusted E-values as additional columns to file ‘DMRfinder_DMR’
<- evalue.DMRfinder('rate_combine_DMRfinder', 'DMRfinder_DMR')
result head(result)
Alternatively, function varevalue.metilene
can also
provides e-value and adjusted e-value.
= evalue_buildin_var_fmt_nm(rate_combine, DMRs, method="DMRfinder")
result = list(a = result$a,
result b = result$b,
a_b = evalue_buildin_sql(result$a, result$b, method="DMRfinder"))
= varevalue.metilene(result$a, result$b, result$a_b)
result head(result)
Given
metilene.input
: the input file of Metilene
containing methylation rates at each CpG sitemetilene.out
: the output file of
Metilene
<- read.table("metilene.input", header = T)
input head(input)
<- read.table("metilene.out", header = F)
out head(out)
Adding E-values and adjusted E-values as additional columns to
metilene.out
<- evalue.metilene('metilene.input', 'metilene.out')
result head(result)
Alternatively, function varevalue.metilene
can also
provides e-value and adjusted e-value.
= evalue_buildin_var_fmt_nm(input, out, method="metilene")
result = list(a = result$a,
result b = result$b,
a_b = evalue_buildin_sql(result$a, result$b, method="metilene"))
= varevalue.metilene(result$a, result$b, result$a_b)
result head(result)
The following program implement the Figure 1 in
Combining e-values and p-values
by Vovk and Wang. We
modified the original python
code to reproduce the result.
The same vectorization is implemented in the R function
varevalue.metilene
. General purpose e-value calculation
could use this program to archieve better performances.
#### Initialization ####
= 100 # how many seeds to consider
n_seeds = 10000 # the number of trials
N = -0.1 # the parameter of the alternative hypothesis
delta # e_x are the e-values and iE are the cumulative e-values
= rep(0, N)
e_x # p_x are the p-values and FP are Fisher's overall p-values
= rep(0, N)
p_x = matrix(1, nrow = N+1, ncol = n_seeds) # product
iE_all = iE_all # universal
uni_all = iE_all # Fisher
FP_all = iE_all # Fisher-VS
F_VS_all
#### Calculation ####
for(seed in 1:n_seeds){
set.seed(seed * 1e3 + 1)
= rnorm(N) + delta
x = exp(delta * x - delta^2/2)
e_x = cumprod(c(1, e_x))
iE_all[, seed] = cumsum(x)
S = 1:N
nn -1,seed] = exp(S^2/2/nn) / sqrt(nn)
uni_all[= pnorm(x)
p_x = -2 * cumsum(log(p_x))
FF -1, seed] = exp(pchisq(FF, df=2*nn, lower.tail = F, log.p = T))
FP_all[= (FP_all[, seed] < exp(-1))
SELR_ = 1/(-exp(1)*FP_all[ SELR_, seed]*log(FP_all[ SELR_, seed]))
F_VS_all[ SELR_, seed]
}
= apply(iE_all, 1, median)
iE = apply(uni_all, 1, median)
uni = apply(FP_all, 1, median)
FP = apply(F_VS_all,1, median)
F_VS
#### Plot ####
library(ggplot2)
library(tidyr)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
<- data.frame(
sim_plots x = 1:(N+1),
product = iE,
universal = uni,
Fisher = 1/(FP),
FisherVS = F_VS
%>%
) gather(key = "Method", value = "E_value", -x) %>%
ggplot(aes(x = x, y = E_value)) +
geom_line(aes(color = Method), size = 1) +
scale_y_continuous(trans='log') +
theme_grey() + # Default
theme(legend.position = "top") +
scale_color_brewer(palette="Dark2") +
xlab("Numer of Observations") +
ylab("e-Value")
print(sim_plots)