This document describes an estimate of the positive predictive value (PPV) of variant discovery for all variants in the gene KCNH2 on long QT syndrome type 2 (LQT2). Additional details on the methods used are published Kroncke et al. 2020 PLOS Genetics and at the following website: (https://oates.app.vumc.org/vancart/SCN5A/SCN5A-report.html). We use observed and estimated probability of LQT2 diagnosis for all known KCNH2 variants as a way to assess the per variant PPV for variant discovery. Our objective is to develope a prior estimate of the per variant PPV on LQT2 which incorporates structure, function, and in silico predictors. We use these in silico and in vitro data to generate a Bayesian prior estimate of the per variant PPV since these data can be generated in a lab setting, unlike heterozygotes/carriers of KCNH2 variants which may or may not exist. The final posterior estimate combines this derived prior and clinically phenotyped heterozygotes/carriers.
# mut_type has includes type and isoform for all variants in the literature and in the cohort.
<- mut_type[mut_type$mut_type == "missense",]
mut_type
# Cohort carrier/heterozygote counts and variant IDs.
# Here we select only the missense variants (in-frame indels are also included as "missense")
<- cohort.data[cohort.data$mut_type == "missense" & cohort.data$total_carriers > 0,]
cohort.data
# Literature dataset where potentially overlapping carriers/heterozygotes are removed
<- lit.nonoverlap.data[lit.nonoverlap.data$mut_type == "missense",]
d
# Here, heterozygotes/carriers from gnomAD are removed to test their influence on performance
# Remove comments in next two lines and complete subsequent evaluation to assess influence of gnomAD on
# calculations.
#d$total_carriers <- d$total_carriers - d$gnomAD # test influence of gnomAD
#d$unaff <- d$total_carriers - d$lqt2 # test influence of gnomAD
# set initial weighting and penetrance
$weight = 1-1/(0.01+d$total_carriers)
d$penetrance_lqt2 <- d$lqt2/d$total_carriers
d$total_carriers < 1,"weight"] <- 0.000 # This is changed to "< 2" when evaluating ROC-AUC of n=1 variants from the literature d[d
Use observed LQT2 diagnosis probability to calculate “LQTS probability density” as described in previous publication. Plot probability density versus residue
# Mean squared error
<- function(sm) {
mse mean((sm$residuals)^2*(sm$weights))
}
# Weighted mean to determine LQT2 penetrance empirical prior
= data.frame(wt=1)
newdata <- lm(penetrance_lqt2 ~ 1, data=d, weights = d$weight)
model summary(model)
##
## Call:
## lm(formula = penetrance_lqt2 ~ 1, data = d, weights = d$weight)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -0.2329 -0.1653 -0.0232 0.0763 0.7561
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2332 0.0135 17.27 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2323 on 789 degrees of freedom
## (467 observations deleted due to missingness)
<-predict(model, newdata)
p<- mse(model)#p*(1-p)
dev
# Derive alpha and beta from weighted mean and MSE (estimated variance)
<- function(mu, var) {
estBetaParams <- ((1 - mu) / var - 1 / mu) * mu ^ 2
alpha <- alpha * (1 / mu - 1)
beta return(params = list(alpha = alpha, beta = beta))
}
# Estimated shape parameters for LQT2 empirical prior
= estBetaParams(p,dev)$alpha
alpha0 = estBetaParams(p,dev)$beta
beta0 print(paste("alpha0 = ", alpha0, " beta0 = ", beta0))
## [1] "alpha0 = 0.54041284982238 beta0 = 1.77720039413882"
# Bayesian LQT2 penetrance estimates from empirical priors
# and observed affected/unaffected counts:
$lqt2_penetranceBayesian_initial <- (alpha0 + d[,"lqt2"])/((alpha0 + beta0 + d[,"total_carriers"]))
d$lqt2_penetranceBayesian<-d$lqt2_penetranceBayesian_initial
d
# Plot literature observed LQT2 penetrance versus residue number
<- d %>%
mselect(resnum, pmean = penetrance_lqt2) %>%
mutate(p_mean_smooth = rollmean(pmean, k=20, fill = NA))
<- loess(d[,"penetrance_lqt2"]~as.numeric(d[,"resnum"]), span = 0.15)
fit plot(d$resnum, d$penetrance_lqt2, xlab ="Residue", ylab = "LQT2 Penetrance Estimate")
<- seq(min(fit$x), max(fit$x), length.out = 100)
xrange <- predict(fit, xrange, se=T)
ps lines(xrange, ps$fit*1, lwd=5)
lines(xrange, (ps$fit+1.96*ps$se.fit)*1, lty=2, lwd=4)
lines(xrange, (ps$fit-1.96*ps$se.fit)*1, lty=2, lwd=4)
With the updated empirical priors applied to carrier counts, calculate “LQTS probability density” as described in previous publication.
# NOTE: adjust 3rd argument given to funcdist, "d,", to d[d$total_carriers>1,] when
# evaluating ROC of total_carriers == 1
"lqt2_dist"]<-NA
h2.covariates[, "lqt2_dist_weight"]<-NA
h2.covariates[, <- mut_type[mut_type$mut_type == "missense" & mut_type$isoform == "A",]
mut_type for(rec in 1:nrow(mut_type)){
if (!is.na(mut_type[rec, "resnum"])) {
<-length(h2.covariates[h2.covariates$var == mut_type[rec, "var"],"var"])
lfor (m in 1:l){
$var == mut_type[rec, "var"],
h2.covariates[h2.covariatesc("lqt2_dist", "lqt2_dist_weight", "resnum")][m,] <- c(funcdist(mut_type[rec, "resnum"], mut_type[rec, "var"], d, h2dist, "penetrance_lqt2", "sigmoid", 7), mut_type[rec, "resnum"])
}
}
}
# Plot lqt2_dist versus residue number
$resnum<-as.integer(h2.covariates$resnum)
h2.covariates<- h2.covariates[order(h2.covariates$resnum),]
h2.covariates <- h2.covariates[!is.na(h2.covariates$resnum),]
h2.covariates <- h2.covariates %>%
mselect(resnum, pmean = lqt2_dist) %>%
mutate(p_mean_smooth = rollmean(pmean, k=20, fill = NA))
<- loess(h2.covariates[,"lqt2_dist"]~as.numeric(h2.covariates[,"resnum"]), span = 0.15)
fit plot(h2.covariates$resnum, h2.covariates$lqt2_dist, xlab ="Residue", ylab = "LQT2 Penetrance Density", xlim=c(0,1160))
<- seq(min(fit$x), max(fit$x), length.out = 100)
xrange <- predict(fit, xrange, se=T)
ps lines(xrange, ps$fit*1, lwd=5)
lines(xrange, (ps$fit+1.96*ps$se.fit)*1, lty=2, lwd=4)
lines(xrange, (ps$fit-1.96*ps$se.fit)*1, lty=2, lwd=4)
Evaluate weighted Spearman correlations coefficients between observed LQT2 diagnosis probability in the literature and various potential predictors
# Merge "d" with full variant list and set carrier counts to 0.
# This is done for convenience so we can estimate LQT2 diagnosis probability for all variants including
# those witheld during model construction. This will make validation easier.
<- merge(d, h2.covariates, all = TRUE)
d <- unique(d)
d
=function(xName,yName,weightName,nPerms,new.mat2){
calcPval# Pulls out variables
=new.mat2[,xName]
x=new.mat2[,yName]
y=new.mat2[,weightName]
w=x[!is.na(x)]
x2=y[!is.na(x)]
y2=w[!is.na(x)]
w2
# Calculate the real correlation
=weightedCorr(x2,y2,method='spearman',weights=w2)
realCorr# Do permutations, calculate fake correlations
=c()
permutedCorrListfor(permNum in 1:nPerms){
=sample(x2,length(x2),replace=FALSE)
permutedX=weightedCorr(permutedX,y2,method='spearman',weights=w2)
wCorrSim=c(permutedCorrList,wCorrSim)
permutedCorrList
}=abs(permutedCorrList)
permutedCorrList2=abs(realCorr)
realCorr2
# Calculate pvalue
=sum(realCorr2<permutedCorrList2)
summ=summ/nPerms
pValuereturn(list(realCorr,pValue,length(x2)))
}
=function(yList,xList,nPerms,weightName,new.mat2){
calcAllPvals=0
i=data.frame()
resultTablefor(yName in yList){
for(xName in xList){
=i+1
i=calcPval(xName,yName,weightName,nPerms,new.mat2)
result'x']=xName
resultTable[i,'y']=yName
resultTable[i,'nPerms']=nPerms
resultTable[i,'weightedCorr']=result[[1]]
resultTable[i,'pValue']=result[[2]]
resultTable[i,'n']=result[[3]]
resultTable[i,#print(resultTable[i,'pValue'])
}
}print(resultTable)
return(resultTable)
}
=c('penetrance_lqt2')
yList=c('hm_ssPeak','hm_tailPeak','hm_vhalfact','hm_vhalfinact','hm_recovfrominact', 'hm_taudeact_fast',
xList'ht_ssPeak','ht_tailPeak','ht_vhalfact','ht_vhalfinact','ht_recovfrominact', 'ht_taudeact_fast',
'pph2_prob', 'provean_score', "blast_pssm",
'pamscore', 'aasimilaritymat', "lqt2_dist", "revel_score")
<-d[!is.na(d$penetrance_lqt2) & !is.na(d$revel_score),]
tmp<-calcAllPvals(yList, xList, 1000, 'weight', tmp) resultTable
## x y nPerms weightedCorr pValue n
## 1 hm_ssPeak penetrance_lqt2 1000 0.319565106 0.264 20
## 2 hm_tailPeak penetrance_lqt2 1000 -0.564758635 0.000 100
## 3 hm_vhalfact penetrance_lqt2 1000 0.147612664 0.293 68
## 4 hm_vhalfinact penetrance_lqt2 1000 0.669666858 0.001 29
## 5 hm_recovfrominact penetrance_lqt2 1000 -0.734693865 0.007 12
## 6 hm_taudeact_fast penetrance_lqt2 1000 -0.375296727 0.025 42
## 7 ht_ssPeak penetrance_lqt2 1000 -0.464626144 0.025 34
## 8 ht_tailPeak penetrance_lqt2 1000 -0.616133062 0.000 83
## 9 ht_vhalfact penetrance_lqt2 1000 -0.061932733 0.710 41
## 10 ht_vhalfinact penetrance_lqt2 1000 -0.173738913 0.475 26
## 11 ht_recovfrominact penetrance_lqt2 1000 -0.407537689 0.407 6
## 12 ht_taudeact_fast penetrance_lqt2 1000 0.009703229 0.975 14
## 13 pph2_prob penetrance_lqt2 1000 0.392919452 0.000 741
## 14 provean_score penetrance_lqt2 1000 -0.585053878 0.000 741
## 15 blast_pssm penetrance_lqt2 1000 -0.250452543 0.000 741
## 16 pamscore penetrance_lqt2 1000 -0.116758403 0.033 750
## 17 aasimilaritymat penetrance_lqt2 1000 0.018401459 0.719 750
## 18 lqt2_dist penetrance_lqt2 1000 0.583417065 0.000 748
## 19 revel_score penetrance_lqt2 1000 0.664409462 0.000 750
<-d[!is.na(d$ht_tailPeak) & !is.na(d$penetrance_lqt2) & !is.na(d$revel_score),]
tmp<-calcAllPvals(yList, xList, 1000, 'weight', tmp) resultTable
## x y nPerms weightedCorr pValue n
## 1 hm_ssPeak penetrance_lqt2 1000 0.690504615 0.097 10
## 2 hm_tailPeak penetrance_lqt2 1000 -0.428867090 0.004 66
## 3 hm_vhalfact penetrance_lqt2 1000 -0.093938286 0.659 35
## 4 hm_vhalfinact penetrance_lqt2 1000 0.629956576 0.049 15
## 5 hm_recovfrominact penetrance_lqt2 1000 -0.772690799 0.294 4
## 6 hm_taudeact_fast penetrance_lqt2 1000 -0.228864954 0.368 17
## 7 ht_ssPeak penetrance_lqt2 1000 -0.464626144 0.022 34
## 8 ht_tailPeak penetrance_lqt2 1000 -0.616133062 0.000 83
## 9 ht_vhalfact penetrance_lqt2 1000 -0.061096836 0.748 40
## 10 ht_vhalfinact penetrance_lqt2 1000 -0.173348992 0.438 25
## 11 ht_recovfrominact penetrance_lqt2 1000 -0.407537689 0.424 6
## 12 ht_taudeact_fast penetrance_lqt2 1000 0.009703229 0.980 14
## 13 pph2_prob penetrance_lqt2 1000 0.332107233 0.008 81
## 14 provean_score penetrance_lqt2 1000 -0.455185289 0.000 81
## 15 blast_pssm penetrance_lqt2 1000 0.055910704 0.653 81
## 16 pamscore penetrance_lqt2 1000 0.058560727 0.655 83
## 17 aasimilaritymat penetrance_lqt2 1000 0.063049344 0.610 83
## 18 lqt2_dist penetrance_lqt2 1000 0.755839461 0.000 83
## 19 revel_score penetrance_lqt2 1000 0.612183017 0.000 83
Use an EM algorithm to estimate LQT2 PPV for each variant (or LQT2 diagnosis probability)
# Literature dataset where potentially overlapping carriers/heterozygotes are removed
<- lit.nonoverlap.data[lit.nonoverlap.data$mut_type == "missense",]
d
# set initial weighting and penetrance
$weight = 1-1/(0.01+d$total_carriers)
d$penetrance_lqt2 <- d$lqt2/d$total_carriers
d$total_carriers < 2,"weight"] <- 0.000 # This is changed to "< 2" here to evaluate ROC-AUC of n=1 variants from the literature d[d
Use observed LQT2 diagnosis probability to calculate “LQTS probability density” as described in previous publication. Plot LQTS probability density versus residue
# Weighted mean to determine LQT2 penetrance empirical prior
= data.frame(wt=1)
newdata <- lm(penetrance_lqt2 ~ 1, data=d, weights = d$weight)
model summary(model)
##
## Call:
## lm(formula = penetrance_lqt2 ~ 1, data = d, weights = d$weight)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -0.2323 -0.1648 0.0000 0.0000 0.7567
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.23253 0.01893 12.28 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3236 on 399 degrees of freedom
## (467 observations deleted due to missingness)
<-predict(model, newdata)
p<- mse(model) #p*(1-p)
dev
# Estimated shape parameters for LQT2 empirical prior
= estBetaParams(p,dev)$alpha
alpha0 = estBetaParams(p,dev)$beta
beta0 print(paste("alpha0 = ", alpha0, " beta0 = ", beta0))
## [1] "alpha0 = 0.552007535426229 beta0 = 1.82190517994969"
# Bayesian LQT2 penetrance estimates from empirical priors
# and observed affected/unaffected counts:
$lqt2_penetranceBayesian_initial <- (alpha0 + d[,"lqt2"])/((alpha0 + beta0 + d[,"total_carriers"]))
d$lqt2_penetranceBayesian<-d$lqt2_penetranceBayesian_initial d
With the updated empirical priors applied to carrier counts, calculate “LQTS probability density” as described in previous publication.
"lqt2_dist"]<-NA
h2.covariates[, "lqt2_dist_weight"]<-NA
h2.covariates[, <- mut_type[mut_type$mut_type == "missense" & mut_type$isoform == "A",]
mut_type for(rec in 1:nrow(mut_type)){
if (!is.na(mut_type[rec, "resnum"])) {
$var == mut_type[rec, "var"],
h2.covariates[h2.covariatesc("lqt2_dist", "lqt2_dist_weight", "resnum")] <- c(funcdist(mut_type[rec, "resnum"], mut_type[rec, "var"], d[d$total_carriers>1,], h2dist, "penetrance_lqt2", "sigmoid", 7), mut_type[rec, "resnum"])
}
}
# Plot lqt2_dist versus residue number
$resnum<-as.integer(h2.covariates$resnum)
h2.covariates<- h2.covariates[order(h2.covariates$resnum),]
h2.covariates <- h2.covariates[!is.na(h2.covariates$resnum),]
h2.covariates
# Merge "d" with full variant list and set carrier counts to 0.
# This is done for convenience so we can estimate LQT2 penetrance for all variants including
# those witheld during model construction. This will make validation easier.
<- merge(d, h2.covariates, all = TRUE)
d <- unique(d) d
Use an EM algorithm to estimate LQT2 PPV for each variant (or LQT2 diagnosis probability)
# Literature dataset where potentially overlapping carriers/heterozygotes are removed
<- lit.cohort.data[lit.cohort.data$mut_type == "missense" & lit.cohort.data$isoform=="A",]
d
# add all possible variants
<-data.frame(unlist(h2.covariates[!is.na(h2.covariates$cardiacboost),"var"]), stringsAsFactors = F)
allvariantsnames(allvariants)<-"var"
$isoform<-unlist(h2.covariates[!is.na(h2.covariates$cardiacboost),"isoform"])
allvariants$mut_type<-unlist(h2.covariates[!is.na(h2.covariates$cardiacboost),"mut_type"])
allvariants<-merge(d,allvariants,all = T)
ais.na(a$total_carriers),"total_carriers"] <- 0
a[is.na(a$lqt2),"lqt2"] <- 0
a[<-a
d
# set initial weighting and penetrance
$weight = 1-1/(0.01+d$total_carriers)
d$penetrance_lqt2 <- d$lqt2/d$total_carriers
d$total_carriers < 1,"weight"] <- 0.000 # This is changed to "< 2" here to evaluate ROC-AUC of n=1 variants from the literature d[d
Use observed LQT2 diagnosis probability to calculate “LQTS probability density” as described in previous publication. Plot diagnosis probability density versus residue
# Mean squared error
<- function(sm) {
mse mean((sm$residuals)^2*(sm$weights))
}
# Derive alpha and beta from weighted mean and MSE (estimated variance)
<- function(mu, var) {
estBetaParams <- ((1 - mu) / var - 1 / mu) * mu ^ 2
alpha <- alpha * (1 / mu - 1)
beta return(params = list(alpha = alpha, beta = beta))
}
# Weighted mean to determine LQT2 penetrance empirical prior
= data.frame(wt=1)
newdata <- lm(penetrance_lqt2 ~ 1, data=d, weights = d$weight)
model summary(model)
##
## Call:
## lm(formula = penetrance_lqt2 ~ 1, data = d, weights = d$weight)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -0.36142 -0.25639 -0.03599 0.06351 0.61328
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.36170 0.01309 27.64 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2574 on 950 degrees of freedom
## (6240 observations deleted due to missingness)
<-predict(model, newdata)
p<- mse(model) #p*(1-p)
dev
# Estimated shape parameters for LQT2 empirical prior
= estBetaParams(p,dev)$alpha
alpha0 = estBetaParams(p,dev)$beta
beta0 print(paste("alpha0 = ", alpha0, " beta0 = ", beta0))
## [1] "alpha0 = 0.899678647955565 beta0 = 1.58771266711751"
# Bayesian LQT2 penetrance estimates from empirical priors
# and observed affected/unaffected counts:
$lqt2_penetranceBayesian_initial <- (alpha0 + d[,"lqt2"])/((alpha0 + beta0 + d[,"total_carriers"]))
d$lqt2_penetranceBayesian<-d$lqt2_penetranceBayesian_initial d
With the updated empirical priors applied to carrier counts, calculate “LQTS probability density” as described in previous publication. !!! NOTE: since these data are truly the “best estimates” we include all variants in the calculation such that unique scores are by residue not by variant.
"lqt2_dist"]<-NA
h2.covariates[, "lqt2_dist_weight"]<-NA
h2.covariates[, <-h2.covariates[h2.covariates$isoform=="A",]
h2.covariates<-0
ldfor(rec in seq(2,1159,1)){
#print(rec)
<- funcdist(rec, "var", d[!is.na(d$total_carriers) & d$total_carriers>0 & d$isoform == "A" & d$mut_type != "nonsense",], h2dist, "penetrance_lqt2", "sigmoid", 7)
ld !is.na(h2.covariates$isoform) & !is.na(h2.covariates$resnum) & h2.covariates$isoform=="A" & h2.covariates$resnum == rec & !is.na(h2.covariates$var) & !is.na(h2.covariates$mut_type) & h2.covariates$mut_type == "missense","lqt2_dist"] <- ld[1]
h2.covariates[!is.na(h2.covariates$isoform) & !is.na(h2.covariates$resnum) & h2.covariates$isoform=="A" & h2.covariates$resnum == rec & !is.na(h2.covariates$var)& !is.na(h2.covariates$mut_type) & h2.covariates$mut_type == "missense","lqt2_dist_weight"] <-ld[2]
h2.covariates[
}
# Plot lqt2_dist versus residue number
$resnum<-as.integer(h2.covariates$resnum)
h2.covariates<- h2.covariates[order(h2.covariates$resnum),]
h2.covariates <- h2.covariates[!is.na(h2.covariates$resnum),]
h2.covariates
# Merge "d" with full variant list and set carrier counts to 0.
# This is done for convenience so we can estimate LQT2 penetrance for all variants including
# those witheld during model construction. This will make validation easier.
<- merge(d, h2.covariates, all = TRUE)
d <- unique(d)
d
# annotate structural location (hotspot)
$Structure<-NA
d!is.na(d$lqt2_dist) & d$lqt2_dist<0.1,"Structure"]<-"Non_Hotspot"
d[!is.na(d$lqt2_dist) & d$lqt2_dist>=0.1 & d$lqt2_dist<0.4,"Structure"]<-"Mild_Hotspot"
d[!is.na(d$lqt2_dist) & d$lqt2_dist>=0.4,"Structure"]<-"Hotspot"
d[
# annotate functional perturbation
$Function<-NA
d!is.na(d$ht_tailPeak) & d$ht_tailPeak<0.25,"Function"]<-"Severe Dominant LOF"
d[!is.na(d$ht_tailPeak) & d$ht_tailPeak<0.5 & d$ht_tailPeak>=0.25,"Function"]<-"Dominant LOF"
d[!is.na(d$ht_tailPeak) & d$ht_tailPeak>=0.5 & d$ht_tailPeak<0.75,"Function"]<-"LOF"
d[!is.na(d$ht_tailPeak) & d$ht_tailPeak>=0.75 & d$ht_tailPeak<1.25,"Function"]<-"Normal"
d[!is.na(d$ht_tailPeak) & d$ht_tailPeak>=1.25,"Function"]<-"GOF" d[
Use an EM algorithm to estimate LQT2 PPV for each variant (or LQT2 diagnosis probability)
load("prepared_data.RData")
load("lit_all_data_checkpoint.RData")
<-cohort.data[!is.na(cohort.data$resnum) & cohort.data$resnum<2000,]
cohort.data
$weight = 1-1/(0.01+cohort.data$total_carriers)
cohort.data$penetrance_lqt2 <- cohort.data$lqt2/cohort.data$total_carriers
cohort.data
for (variant in cohort.data$var) {
if (!is.na(match(variant, d$var))) {
$var == variant, c("p_mean_w","alpha", "beta", "lqt2_lit", "unaff_lit", "total_carriers_lit", "lqt2_dist")] <- d[match(variant, d$var), c("p_mean_w", "alpha", "beta", "lqt2", "unaff", "total_carriers", "lqt2_dist")]
cohort.data[cohort.data
}
}
<- cohort.data %>%
mselect(resnum, pmean = penetrance_lqt2) %>%
mutate(p_mean_smooth = rollmean(pmean, k=20, fill = NA))
<- loess(cohort.data[,"penetrance_lqt2"]~as.numeric(cohort.data[,"resnum"]), span = 0.15)
fit plot(cohort.data$resnum, cohort.data$penetrance_lqt2, xlab ="Residue", ylab = "LQT2 diagnosis probability", col = "red", pch=19)
<- seq(min(fit$x), max(fit$x), length.out = 100)
xrange <- predict(fit, xrange, se=T)
ps lines(xrange, ps$fit*1, lwd=5)
lines(xrange, (ps$fit+1.96*ps$se.fit)*1, lty=2, lwd=4)
lines(xrange, (ps$fit-1.96*ps$se.fit)*1, lty=2, lwd=4)
# Add covariates to cohort dataset
<- merge(cohort.data, h2.covariates, all = TRUE)
cohort.data <- unique(cohort.data)
cohort.data
# Save cohort data
save(cohort.data, file = "cohort_checkpoint.RData")
Use the observed diagnosis probability from as the TRUE diagnosis probability, generate n binomial observations
Use the final EM algorithm posterior as the prior for Beta-Binomial, incorporate data from step (1), generate the posterior distribution, and get 95% credible interval.
Check whether the interval cover the true diagnosis probability from Step 1.
Repeat Step 1 to Step 3 N times to get the coverage rate.
<- function(var,n=100,N=1000,true){
BootsCoverage
# var: variant name
# n: number of subjects in the new data
# N: number of Bootstrap
# extract the "true" diagnosis probability
<- d[d$var==var,true]
true.p
# generate binomial data
<- rbinom(N,n,true.p)
event
# get the posterior credible interval
<- d$alpha[which(d$var==var)]
alpha <- d$beta[which(d$var==var)]
beta
<- alpha + event
new.alpha <- beta + n - event
new.beta
<- qbeta(0.025,new.alpha,new.beta)
lb <- qbeta(0.975,new.alpha,new.beta)
ub
# change lb to floor of nearest 0.1
<- floor(lb*20)/20
lb <- ceiling(ub*20)/20
ub
return(sum(lb < true.p & ub > true.p)/N)
}
The coverage plot where observed diagnosis probability is the “true” diagnosis probability and one hundred new observations are added is shown below.
# load data, literature + gnomAD + cohort
load("lit_all_data_checkpoint.RData")
load("cohort_checkpoint.RData")
=function(xName,yName,weightName,nPerms,new.mat2){
calcPval# Pulls out variables
=new.mat2[,xName]
x=new.mat2[,yName]
y=new.mat2[,weightName]
w=x[!is.na(x)]
x2=y[!is.na(x)]
y2=w[!is.na(x)]
w2
# Calculate the real correlation
=weightedCorr(x2,y2,method='spearman',weights=w2)
realCorr# Do permutations, calculate fake correlations
=c()
permutedCorrListfor(permNum in 1:nPerms){
=sample(x2,length(x2),replace=FALSE)
permutedX=weightedCorr(permutedX,y2,method='spearman',weights=w2)
wCorrSim=c(permutedCorrList,wCorrSim)
permutedCorrList
}=abs(permutedCorrList)
permutedCorrList2=abs(realCorr)
realCorr2
# Calculate pvalue
=sum(realCorr2<permutedCorrList2)
summ=summ/nPerms
pValuereturn(list(realCorr,pValue,length(x2)))
}
=function(yList,xList,nPerms,weightName,new.mat2){
calcAllPvals=0
i=data.frame()
resultTablefor(yName in yList){
for(xName in xList){
=i+1
i=calcPval(xName,yName,weightName,nPerms,new.mat2)
result'x']=xName
resultTable[i,'y']=yName
resultTable[i,'nPerms']=nPerms
resultTable[i,'weightedCorr']=result[[1]]
resultTable[i,'pValue']=result[[2]]
resultTable[i,'n']=result[[3]]
resultTable[i,#print(resultTable[i,'pValue'])
}
}print(resultTable)
return(resultTable)
}
# Select covariates from isoform "A" in cohort dataset
<- cohort.data[!is.na(cohort.data$total_carriers) & cohort.data$isoform == "A" & cohort.data$mut_type == "missense",]
cohort.data <- cohort.data[,!names(cohort.data) %in% "cardiacboost"]
cohort.data <- unique(cohort.data)
cohort.data
<-cohort.data[!is.na(cohort.data$provean_score) & !is.na(cohort.data$revel_score) & !is.na(cohort.data$pamscore),]
tmp=c('penetrance_lqt2')
yList=c('hm_ssPeak','hm_tailPeak','hm_vhalfact','hm_vhalfinact','hm_recovfrominact', 'hm_taudeact_fast',
xList'ht_ssPeak','ht_tailPeak','ht_vhalfact','ht_vhalfinact','ht_recovfrominact', 'ht_taudeact_fast',
'pph2_prob', 'provean_score', "blast_pssm",
'pamscore', 'aasimilaritymat', "lqt2_dist", "revel_score", 'p_mean_w')
<-calcAllPvals(yList, xList, 1000, 'weight', tmp) resultTable
## x y nPerms weightedCorr pValue n
## 1 hm_ssPeak penetrance_lqt2 1000 -0.13782555 0.755 8
## 2 hm_tailPeak penetrance_lqt2 1000 -0.45322142 0.011 46
## 3 hm_vhalfact penetrance_lqt2 1000 0.17406867 0.431 34
## 4 hm_vhalfinact penetrance_lqt2 1000 0.58824513 0.104 13
## 5 hm_recovfrominact penetrance_lqt2 1000 -0.01793348 0.978 8
## 6 hm_taudeact_fast penetrance_lqt2 1000 0.09551958 0.749 26
## 7 ht_ssPeak penetrance_lqt2 1000 -0.72712827 0.033 10
## 8 ht_tailPeak penetrance_lqt2 1000 -0.74007175 0.000 38
## 9 ht_vhalfact penetrance_lqt2 1000 -0.06920233 0.774 21
## 10 ht_vhalfinact penetrance_lqt2 1000 -0.42544340 0.180 13
## 11 ht_recovfrominact penetrance_lqt2 1000 0.31291231 0.511 4
## 12 ht_taudeact_fast penetrance_lqt2 1000 0.40856426 0.177 13
## 13 pph2_prob penetrance_lqt2 1000 0.29946329 0.000 255
## 14 provean_score penetrance_lqt2 1000 -0.40683895 0.000 255
## 15 blast_pssm penetrance_lqt2 1000 -0.08826634 0.232 255
## 16 pamscore penetrance_lqt2 1000 -0.03980324 0.612 255
## 17 aasimilaritymat penetrance_lqt2 1000 0.09491418 0.207 255
## 18 lqt2_dist penetrance_lqt2 1000 0.48764962 0.000 255
## 19 revel_score penetrance_lqt2 1000 0.46266065 0.000 255
## 20 p_mean_w penetrance_lqt2 1000 0.53708337 0.000 255
rm(tmp)
rm(t)
=0
i<-data.frame()
tmpfor (x in xList){
=i+2
i-1,"Feature"]<-x
tmp[i<-d[!is.na(d[,x]) & d$total_carriers>0,]
t<-t[!is.na(t[,"var"]),]
t"Feature"]<-paste(x,"_cohort")
tmp[i,<-d[!is.na(d[,x]) & d$total_carriers>0,]
t<-t[!is.na(t[,"var"]),]
t<- boot(t, function(data,indices)
foo weightedCorr(t[indices,x],t$penetrance_lqt2[indices], method="spearman", weights = t$weight[indices]), R=1000)
-1,"Spearman"]<-foo$t0
tmp[i-1,"Spearman_low"]<-quantile(foo$t,c(0.025,0.975), na.rm = T)[1][[1]]
tmp[i-1,"Spearman_high"]<-quantile(foo$t,c(0.025,0.975), na.rm = T)[2][[1]]
tmp[i-1,"n"]<-length(t[,x])
tmp[i<-cohort.data[!is.na(cohort.data[,x]) & cohort.data$total_carriers>0,]
t<-t[!is.na(t[,"var"]),]
t<- boot(t, function(data,indices)
foo weightedCorr(t[indices,x],t$penetrance_lqt2[indices], method="spearman", weights = t$weight[indices]), R=1000)
"Spearman"]<-foo$t0
tmp[i,"Spearman_low"]<-quantile(foo$t,c(0.025,0.975), na.rm = T)[1][[1]]
tmp[i,"Spearman_high"]<-quantile(foo$t,c(0.025,0.975), na.rm = T)[2][[1]]
tmp[i,"n"]<-length(t[,x])
tmp[i,
}
forestplot(tmp$Feature,tmp$Spearman,tmp$Spearman_low,tmp$Spearman_high)
# reset "tmp" variable
<-cohort.data[!is.na(cohort.data$provean_score) & !is.na(cohort.data$revel_score) & !is.na(cohort.data$pamscore),]
tmp# Weighted R2 between observed LQT2 penetrance and post-test probability
<- boot(tmp, function(data,indices)
foo weightedCorr(tmp$p_mean_w[indices],tmp$penetrance_lqt2[indices], method="pearson", weights = tmp$weight[indices])^2, R=1000)
print("EM estimated LQT2 diagnosis probability versus observed cohort LQT2 diagnosis probability")
## [1] "EM estimated LQT2 diagnosis probability versus observed cohort LQT2 diagnosis probability"
$t0 foo
## [1] 0.3281077
quantile(foo$t,c(0.025,0.975))
## 2.5% 97.5%
## 0.2155762 0.4505238
<- lm(penetrance_lqt2~lqt2_dist, data = tmp, weights = weight)
model <-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
mod<- boot(mod, function(data,indices)
foo weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("LQT2 diagnosis probability density versus observed cohort LQT2 diagnosis probability")
## [1] "LQT2 diagnosis probability density versus observed cohort LQT2 diagnosis probability"
$t0 foo
## [1] 0.2513792
quantile(foo$t,c(0.025,0.975))
## 2.5% 97.5%
## 0.1458213 0.3669153
<- lm(penetrance_lqt2~revel_score, data = tmp, weights = weight)
model <-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
mod<- boot(mod, function(data,indices)
foo weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("REVEL score versus observed cohort LQT2 diagnosis probability")
## [1] "REVEL score versus observed cohort LQT2 diagnosis probability"
$t0 foo
## [1] 0.2316343
quantile(foo$t,c(0.025,0.975))
## 2.5% 97.5%
## 0.1338631 0.3560617
# Evaluate only variants with Heterozygous peak tail current measured.
<-cohort.data[!is.na(cohort.data$ht_tailPeak),]
tmp<-calcAllPvals(yList, xList, 1000, 'weight', tmp) resultTable
## x y nPerms weightedCorr pValue n
## 1 hm_ssPeak penetrance_lqt2 1000 0.972864918 0.300 4
## 2 hm_tailPeak penetrance_lqt2 1000 -0.251150899 0.258 28
## 3 hm_vhalfact penetrance_lqt2 1000 -0.005613182 0.988 14
## 4 hm_vhalfinact penetrance_lqt2 1000 -0.587879495 0.300 6
## 5 hm_recovfrominact penetrance_lqt2 1000 -1.000000000 0.000 2
## 6 hm_taudeact_fast penetrance_lqt2 1000 0.154267674 0.797 6
## 7 ht_ssPeak penetrance_lqt2 1000 -0.752439389 0.022 11
## 8 ht_tailPeak penetrance_lqt2 1000 -0.750024456 0.000 39
## 9 ht_vhalfact penetrance_lqt2 1000 0.107075664 0.690 20
## 10 ht_vhalfinact penetrance_lqt2 1000 -0.499958288 0.088 14
## 11 ht_recovfrominact penetrance_lqt2 1000 0.312912311 0.494 4
## 12 ht_taudeact_fast penetrance_lqt2 1000 0.558320083 0.085 11
## 13 pph2_prob penetrance_lqt2 1000 0.366819707 0.040 38
## 14 provean_score penetrance_lqt2 1000 -0.292846391 0.125 38
## 15 blast_pssm penetrance_lqt2 1000 0.172315938 0.356 38
## 16 pamscore penetrance_lqt2 1000 -0.002273073 0.987 39
## 17 aasimilaritymat penetrance_lqt2 1000 0.247567799 0.171 39
## 18 lqt2_dist penetrance_lqt2 1000 0.666753766 0.000 39
## 19 revel_score penetrance_lqt2 1000 0.709843622 0.000 39
## 20 p_mean_w penetrance_lqt2 1000 0.763687875 0.000 39
<- boot(tmp, function(data,indices)
foo weightedCorr(tmp$p_mean_w[indices],tmp$penetrance_lqt2[indices], method="pearson", weights = tmp$weight[indices])^2, R=1000)
print("EM estimated LQT2 diagnosis probability versus observed cohort LQT2 diagnosis probability")
## [1] "EM estimated LQT2 diagnosis probability versus observed cohort LQT2 diagnosis probability"
$t0 foo
## [1] 0.5640682
quantile(foo$t,c(0.025,0.975))
## 2.5% 97.5%
## 0.2843941 0.7730929
<- lm(penetrance_lqt2~ht_tailPeak, data = tmp, weights = weight)
model <-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
mod<- boot(mod, function(data,indices)
foo weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("Heterozygously measured peak tail current versus observed cohort LQT2 diagnosis probability")
## [1] "Heterozygously measured peak tail current versus observed cohort LQT2 diagnosis probability"
$t0 foo
## [1] 0.4953738
quantile(foo$t,c(0.025,0.975))
## 2.5% 97.5%
## 0.2951170 0.6951972
<- lm(penetrance_lqt2~lqt2_dist, data = tmp, weights = weight)
model <-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
mod<- boot(mod, function(data,indices)
foo weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("LQT2 probability density versus observed cohort LQT2 diagnosis probability")
## [1] "LQT2 probability density versus observed cohort LQT2 diagnosis probability"
$t0 foo
## [1] 0.4031363
quantile(foo$t,c(0.025,0.975))
## 2.5% 97.5%
## 0.1663152 0.6468372
<- lm(penetrance_lqt2~revel_score, data = tmp, weights = weight)
model <-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
mod<- boot(mod, function(data,indices)
foo weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("REVEL score versus observed cohort LQT2 diagnosis probability")
## [1] "REVEL score versus observed cohort LQT2 diagnosis probability"
$t0 foo
## [1] 0.4568439
quantile(foo$t,c(0.025,0.975))
## 2.5% 97.5%
## 0.2465301 0.7131307
load("lit_all_data_checkpoint.RData")
<-d[!is.na(d$provean_score) & !is.na(d$penetrance_lqt2) & !is.na(d$lqt2_dist) & !is.na(d$revel_score),]
tmp
<- boot(tmp, function(data,indices)
foo weightedCorr(tmp$p_mean_w[indices],tmp$penetrance_lqt2[indices], method="pearson", weights = tmp$weight[indices])^2, R=1000)
print("EM estimated LQT2 diagnosis probability versus observed literature LQT2 diagnosis probability")
## [1] "EM estimated LQT2 diagnosis probability versus observed literature LQT2 diagnosis probability"
$t0 foo
## [1] 0.8168311
quantile(foo$t,c(0.025,0.975))
## 2.5% 97.5%
## 0.7677140 0.8596818
<- lm(penetrance_lqt2~lqt2_dist, data = tmp, weights = weight)
model <-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
mod<- boot(mod, function(data,indices)
foo weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("LQT2 probability density versus observed literature LQT2 diagnosis probability")
## [1] "LQT2 probability density versus observed literature LQT2 diagnosis probability"
$t0 foo
## [1] 0.4902899
quantile(foo$t,c(0.025,0.975))
## 2.5% 97.5%
## 0.3887100 0.5914084
<- lm(penetrance_lqt2~revel_score, data = tmp, weights = weight)
model <-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
mod<- boot(mod, function(data,indices)
foo weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("REVEL versus observed literature LQT2 diagnosis probability")
## [1] "REVEL versus observed literature LQT2 diagnosis probability"
$t0 foo
## [1] 0.3920345
quantile(foo$t,c(0.025,0.975))
## 2.5% 97.5%
## 0.3249512 0.4562831
# Evaluate only variants with Heterozygous peak tail current measured.
<-d[!is.na(d$ht_tailPeak) & !is.na(d$provean_score) & !is.na(d$penetrance_lqt2) & !is.nan(d$penetrance_lqt2) & !is.na(d$lqt2_dist),]
tmp
<- boot(tmp, function(data,indices)
foo weightedCorr(tmp$p_mean_w[indices],tmp$penetrance_lqt2[indices], method="pearson", weights = tmp$weight[indices])^2, R=1000)
print("EM estimated LQT2 diagnosis probability versus observed literature LQT2 diagnosis probability")
## [1] "EM estimated LQT2 diagnosis probability versus observed literature LQT2 diagnosis probability"
$t0 foo
## [1] 0.8884971
quantile(foo$t,c(0.025,0.975))
## 2.5% 97.5%
## 0.8053722 0.9438262
<- lm(penetrance_lqt2~lqt2_dist, data = tmp, weights = weight)
model <-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
mod<- boot(mod, function(data,indices)
foo weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("LQT2 probability density versus observed literature LQT2 diagnosis probability")
## [1] "LQT2 probability density versus observed literature LQT2 diagnosis probability"
$t0 foo
## [1] 0.5945224
quantile(foo$t,c(0.025,0.975))
## 2.5% 97.5%
## 0.4220788 0.7419627
<- lm(penetrance_lqt2~ht_tailPeak, data = tmp, weights = weight)
model <-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
mod<- boot(mod, function(data,indices)
foo weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("Heterozygous peak tail current versus observed literature LQT2 diagnosis probability")
## [1] "Heterozygous peak tail current versus observed literature LQT2 diagnosis probability"
$t0 foo
## [1] 0.3390998
quantile(foo$t,c(0.025,0.975))
## 2.5% 97.5%
## 0.1814097 0.5573245
<- lm(penetrance_lqt2~revel_score, data = tmp, weights = weight)
model <-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
mod<- boot(mod, function(data,indices)
foo weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("REVEL versus observed literature LQT2 diagnosis probability")
## [1] "REVEL versus observed literature LQT2 diagnosis probability"
$t0 foo
## [1] 0.4286368
quantile(foo$t,c(0.025,0.975))
## 2.5% 97.5%
## 0.2418773 0.6251751
AUCs from ROCs predicting EM posteriors
## [1] "263 218"
## [1] "263 216"
## [1] "263 214"
## [1] "263 212"
## [1] "263 205"
## [1] "263 193"
## [1] "263 193"
## [1] "263 190"
## [1] "263 190"
## [1] "263 160"
## [1] "263 158"
## [1] "263 153"
## [1] "263 135"
## [1] "263 135"
## [1] "263 127"
## Area under the curve: 0.6648
## Area under the curve: 0.5209
## Area under the curve: 0.7175
## Area under the curve: 0.5836
## Area under the curve: 0.8093
## Area under the curve: 0.8228
## [1] "741 234"
## [1] "741 231"
## [1] "741 229"
## [1] "741 223"
## [1] "741 218"
## [1] "741 211"
## [1] "741 208"
## [1] "741 205"
## [1] "741 205"
## [1] "741 193"
## [1] "741 192"
## [1] "741 190"
## [1] "741 183"
## [1] "741 179"
## [1] "741 174"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
## [1] "338 94"
The code to produce the forest plots is shown in the code chunk.
rm(d)
# Loading combined literature, gnomAD, and cohort dataset
load("lit_all_data_checkpoint.RData")
<-d[d$total_carriers>0 & d$mut_type!="nonsense" & d$isoform=="A",]
d<-d[!is.na(d$var),]
d
<- (d$alpha + d$lqt2)/(d$alpha+d$beta+d$total_carriers)
mean.post <- (d$alpha)/(d$alpha+d$beta)
mean.prior
<- qbeta(0.025,d$alpha,d$beta)
lower.prior <- qbeta(0.975,d$alpha,d$beta)
higher.prior
<- qbeta(0.025,d$alpha+d$lqt2,d$beta+d$total_carriers-d$lqt2)
lower.post <- qbeta(0.975,d$alpha+d$lqt2,d$beta+d$total_carriers-d$lqt2)
higher.post
<- data.frame(variant = d$var, mean=mean.post,
forest.data.post lower=lower.post, higher=higher.post, resnum=d$resnum, tc=d$total_carriers, lqt2=d$lqt2)
$group<-"posterior"
forest.data.post
<- data.frame(variant = d$var, mean=mean.prior,
forest.data.prior lower=lower.prior, higher=higher.prior, resnum=d$resnum, tc=d$total_carriers, lqt2=d$lqt2)
$group<-"prior"
forest.data.prior
<-rbind(forest.data.post, forest.data.prior)
forest.data<-forest.data[order(forest.data$resnum, forest.data$variant),]
forest.data$label<-""
forest.data$group=="posterior","label"]<-paste(forest.data[forest.data$group=="posterior","lqt2"], "/", forest.data[forest.data$group=="posterior","tc"])
forest.data[forest.data
#define colours for dots and bars
= c("#866D4B","#000000")
dotCOLS = c("#FFFFFF","#FFFFFF")
barCOLS
<- function(a,b){
plotg <-forest.data[a:b,]
fdpng( paste("Plots/", a, "-",b,"pics.png",sep=""),res=300,height=10,width=10,units="in")
<-ggplot(fd, aes(x=reorder(variant,-resnum), y=mean, ymin=lower, ymax=higher, col=group, fill=group)) +
pgeom_text(data=fd, aes(x=reorder(variant,-resnum), label=label)) +
#specify position here
geom_linerange(size=2,position=position_dodge(width = 1)) +
geom_hline(yintercept=1, lty=1) +
geom_hline(yintercept=0, lty=1) +
#specify position here too
geom_point(size=2, shape=21, colour="white", stroke = 0.5,position=position_dodge(width = 1)) +
scale_fill_manual(values=barCOLS)+
scale_color_manual(values=dotCOLS)+
scale_y_continuous(name="LQT2 Diagnosis Probability", limits = c(0, 1)) +
coord_flip() +
theme_minimal()
print(p)
dev.off()
}
sapply(0:30*50+1,function(x) plotg(x,x+49) )
The forests plots are pasted below, where gold bars are posteriors, and black bars are EM priors. Variant name is given to the left and the number of LQT2 affected / total heterozygote count is given above the dot indicating the posterior mean.