Introduction

This document describes an estimate of the positive predictive value (PPV) of variant discovery for all variants in the gene KCNH2 on long QT syndrome type 2 (LQT2). Additional details on the methods used are published Kroncke et al. 2020 PLOS Genetics and at the following website: (https://oates.app.vumc.org/vancart/SCN5A/SCN5A-report.html). We use observed and estimated probability of LQT2 diagnosis for all known KCNH2 variants as a way to assess the per variant PPV for variant discovery. Our objective is to develope a prior estimate of the per variant PPV on LQT2 which incorporates structure, function, and in silico predictors. We use these in silico and in vitro data to generate a Bayesian prior estimate of the per variant PPV since these data can be generated in a lab setting, unlike heterozygotes/carriers of KCNH2 variants which may or may not exist. The final posterior estimate combines this derived prior and clinically phenotyped heterozygotes/carriers.

Part 1: Calculate probability of LQT2 diagnosis and LQT2 Probability Density using Various Subsets of the Literature and Cohort Data

All Literature Variants

# mut_type has includes type and isoform for all variants in the literature and in the cohort.
mut_type <- mut_type[mut_type$mut_type == "missense",]

# Cohort carrier/heterozygote counts and variant IDs. 
# Here we select only the missense variants (in-frame indels are also included as "missense")
cohort.data <- cohort.data[cohort.data$mut_type == "missense" & cohort.data$total_carriers > 0,]

# Literature dataset where potentially overlapping carriers/heterozygotes are removed
d <- lit.nonoverlap.data[lit.nonoverlap.data$mut_type == "missense",]

# Here, heterozygotes/carriers from gnomAD are removed to test their influence on performance
# Remove comments in next two lines and complete subsequent evaluation to assess influence of gnomAD on
# calculations.
#d$total_carriers <- d$total_carriers - d$gnomAD # test influence of gnomAD
#d$unaff <- d$total_carriers - d$lqt2 # test influence of gnomAD

# set initial weighting and penetrance
d$weight = 1-1/(0.01+d$total_carriers)
d$penetrance_lqt2 <- d$lqt2/d$total_carriers
d[d$total_carriers < 1,"weight"] <- 0.000 # This is changed to "< 2" when evaluating ROC-AUC of n=1 variants from the literature

LQT2 empirical LQTS diagnosis probability prior

Use observed LQT2 diagnosis probability to calculate “LQTS probability density” as described in previous publication. Plot probability density versus residue

# Mean squared error
mse <- function(sm) {
  mean((sm$residuals)^2*(sm$weights))
}

# Weighted mean to determine LQT2 penetrance empirical prior
newdata = data.frame(wt=1)
model <- lm(penetrance_lqt2 ~ 1, data=d, weights = d$weight)
summary(model)
## 
## Call:
## lm(formula = penetrance_lqt2 ~ 1, data = d, weights = d$weight)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.2329 -0.1653 -0.0232  0.0763  0.7561 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.2332     0.0135   17.27   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2323 on 789 degrees of freedom
##   (467 observations deleted due to missingness)
p<-predict(model, newdata)
dev<- mse(model)#p*(1-p)

# Derive alpha and beta from weighted mean and MSE (estimated variance)
estBetaParams <- function(mu, var) {
  alpha <- ((1 - mu) / var - 1 / mu) * mu ^ 2
  beta <- alpha * (1 / mu - 1)
  return(params = list(alpha = alpha, beta = beta))
}

# Estimated shape parameters for LQT2 empirical prior
alpha0 = estBetaParams(p,dev)$alpha
beta0 = estBetaParams(p,dev)$beta
print(paste("alpha0 = ", alpha0, "  beta0 = ", beta0))
## [1] "alpha0 =  0.54041284982238   beta0 =  1.77720039413882"
# Bayesian LQT2 penetrance estimates from empirical priors 
# and observed affected/unaffected counts:
d$lqt2_penetranceBayesian_initial <- (alpha0 + d[,"lqt2"])/((alpha0 + beta0 + d[,"total_carriers"]))
d$lqt2_penetranceBayesian<-d$lqt2_penetranceBayesian_initial

# Plot literature observed LQT2 penetrance versus residue number
m<- d %>% 
  select(resnum, pmean = penetrance_lqt2) %>% 
  mutate(p_mean_smooth = rollmean(pmean, k=20, fill = NA))

fit <- loess(d[,"penetrance_lqt2"]~as.numeric(d[,"resnum"]), span = 0.15)
plot(d$resnum, d$penetrance_lqt2, xlab ="Residue", ylab = "LQT2 Penetrance Estimate")
xrange <- seq(min(fit$x), max(fit$x), length.out = 100)
ps <- predict(fit, xrange, se=T)
lines(xrange, ps$fit*1, lwd=5)
lines(xrange, (ps$fit+1.96*ps$se.fit)*1, lty=2, lwd=4)
lines(xrange, (ps$fit-1.96*ps$se.fit)*1, lty=2, lwd=4)

Calculate LQTS probability densities

With the updated empirical priors applied to carrier counts, calculate “LQTS probability density” as described in previous publication.

# NOTE: adjust 3rd argument given to funcdist, "d,", to d[d$total_carriers>1,] when 
# evaluating ROC of total_carriers == 1

h2.covariates[, "lqt2_dist"]<-NA
h2.covariates[, "lqt2_dist_weight"]<-NA
mut_type <- mut_type[mut_type$mut_type == "missense" & mut_type$isoform == "A",]
for(rec in 1:nrow(mut_type)){
  if (!is.na(mut_type[rec, "resnum"])) {
    l<-length(h2.covariates[h2.covariates$var == mut_type[rec, "var"],"var"])
    for (m in 1:l){
    h2.covariates[h2.covariates$var == mut_type[rec, "var"], 
                  c("lqt2_dist", "lqt2_dist_weight", "resnum")][m,] <- c(funcdist(mut_type[rec, "resnum"], mut_type[rec, "var"], d, h2dist, "penetrance_lqt2", "sigmoid", 7), mut_type[rec, "resnum"])
    }
  }
}

# Plot lqt2_dist versus residue number
h2.covariates$resnum<-as.integer(h2.covariates$resnum)
h2.covariates <- h2.covariates[order(h2.covariates$resnum),]
h2.covariates <- h2.covariates[!is.na(h2.covariates$resnum),]
m<- h2.covariates %>% 
  select(resnum, pmean = lqt2_dist) %>% 
  mutate(p_mean_smooth = rollmean(pmean, k=20, fill = NA))

fit <- loess(h2.covariates[,"lqt2_dist"]~as.numeric(h2.covariates[,"resnum"]), span = 0.15)
plot(h2.covariates$resnum, h2.covariates$lqt2_dist, xlab ="Residue", ylab = "LQT2 Penetrance Density", xlim=c(0,1160))
xrange <- seq(min(fit$x), max(fit$x), length.out = 100)
ps <- predict(fit, xrange, se=T)
lines(xrange, ps$fit*1, lwd=5)
lines(xrange, (ps$fit+1.96*ps$se.fit)*1, lty=2, lwd=4)
lines(xrange, (ps$fit-1.96*ps$se.fit)*1, lty=2, lwd=4)

Calculate Weighted Spearman Correlation Coefficients

Evaluate weighted Spearman correlations coefficients between observed LQT2 diagnosis probability in the literature and various potential predictors

# Merge "d" with full variant list and set carrier counts to 0. 
# This is done for convenience so we can estimate LQT2 diagnosis probability for all variants including 
# those witheld during model construction. This will make validation easier.
d <- merge(d, h2.covariates, all = TRUE)
d <- unique(d) 

calcPval=function(xName,yName,weightName,nPerms,new.mat2){
  # Pulls out variables

  x=new.mat2[,xName] 
  y=new.mat2[,yName] 
  w=new.mat2[,weightName]
  x2=x[!is.na(x)]
  y2=y[!is.na(x)]
  w2=w[!is.na(x)]

  # Calculate the real correlation
  realCorr=weightedCorr(x2,y2,method='spearman',weights=w2)
  # Do permutations, calculate fake correlations
  permutedCorrList=c()
  for(permNum in 1:nPerms){
    permutedX=sample(x2,length(x2),replace=FALSE)
    wCorrSim=weightedCorr(permutedX,y2,method='spearman',weights=w2)
    permutedCorrList=c(permutedCorrList,wCorrSim)
  }
  permutedCorrList2=abs(permutedCorrList)
  realCorr2=abs(realCorr)
  
  # Calculate pvalue
  summ=sum(realCorr2<permutedCorrList2)
  pValue=summ/nPerms
  return(list(realCorr,pValue,length(x2)))
}

calcAllPvals=function(yList,xList,nPerms,weightName,new.mat2){
  i=0
  resultTable=data.frame()
  for(yName in yList){
    for(xName in xList){
      i=i+1
      result=calcPval(xName,yName,weightName,nPerms,new.mat2)
      resultTable[i,'x']=xName
      resultTable[i,'y']=yName
      resultTable[i,'nPerms']=nPerms
      resultTable[i,'weightedCorr']=result[[1]]
      resultTable[i,'pValue']=result[[2]]
      resultTable[i,'n']=result[[3]]
      #print(resultTable[i,'pValue'])
    }
  }
  print(resultTable)
  return(resultTable)
}


yList=c('penetrance_lqt2')
xList=c('hm_ssPeak','hm_tailPeak','hm_vhalfact','hm_vhalfinact','hm_recovfrominact', 'hm_taudeact_fast', 
        'ht_ssPeak','ht_tailPeak','ht_vhalfact','ht_vhalfinact','ht_recovfrominact', 'ht_taudeact_fast',
        'pph2_prob', 'provean_score', "blast_pssm",
        'pamscore', 'aasimilaritymat', "lqt2_dist", "revel_score")
tmp<-d[!is.na(d$penetrance_lqt2) & !is.na(d$revel_score),]
resultTable<-calcAllPvals(yList, xList, 1000, 'weight', tmp)
##                    x               y nPerms weightedCorr pValue   n
## 1          hm_ssPeak penetrance_lqt2   1000  0.319565106  0.262  20
## 2        hm_tailPeak penetrance_lqt2   1000 -0.564758635  0.000 100
## 3        hm_vhalfact penetrance_lqt2   1000  0.147612664  0.291  68
## 4      hm_vhalfinact penetrance_lqt2   1000  0.669666858  0.000  29
## 5  hm_recovfrominact penetrance_lqt2   1000 -0.734693865  0.011  12
## 6   hm_taudeact_fast penetrance_lqt2   1000 -0.375296727  0.034  42
## 7          ht_ssPeak penetrance_lqt2   1000 -0.464626144  0.023  34
## 8        ht_tailPeak penetrance_lqt2   1000 -0.616133062  0.000  83
## 9        ht_vhalfact penetrance_lqt2   1000 -0.061932733  0.736  41
## 10     ht_vhalfinact penetrance_lqt2   1000 -0.173738913  0.454  26
## 11 ht_recovfrominact penetrance_lqt2   1000 -0.407537689  0.378   6
## 12  ht_taudeact_fast penetrance_lqt2   1000  0.009703229  0.971  14
## 13         pph2_prob penetrance_lqt2   1000  0.392919452  0.000 741
## 14     provean_score penetrance_lqt2   1000 -0.585053878  0.000 741
## 15        blast_pssm penetrance_lqt2   1000 -0.250452543  0.000 741
## 16          pamscore penetrance_lqt2   1000 -0.116758403  0.026 750
## 17   aasimilaritymat penetrance_lqt2   1000  0.018401459  0.703 750
## 18         lqt2_dist penetrance_lqt2   1000  0.583417065  0.000 748
## 19       revel_score penetrance_lqt2   1000  0.664409462  0.000 750
tmp<-d[!is.na(d$ht_tailPeak) & !is.na(d$penetrance_lqt2) & !is.na(d$revel_score),]
resultTable<-calcAllPvals(yList, xList, 1000, 'weight', tmp)
##                    x               y nPerms weightedCorr pValue  n
## 1          hm_ssPeak penetrance_lqt2   1000  0.690504615  0.088 10
## 2        hm_tailPeak penetrance_lqt2   1000 -0.428867090  0.004 66
## 3        hm_vhalfact penetrance_lqt2   1000 -0.093938286  0.627 35
## 4      hm_vhalfinact penetrance_lqt2   1000  0.629956576  0.048 15
## 5  hm_recovfrominact penetrance_lqt2   1000 -0.772690799  0.286  4
## 6   hm_taudeact_fast penetrance_lqt2   1000 -0.228864954  0.405 17
## 7          ht_ssPeak penetrance_lqt2   1000 -0.464626144  0.012 34
## 8        ht_tailPeak penetrance_lqt2   1000 -0.616133062  0.000 83
## 9        ht_vhalfact penetrance_lqt2   1000 -0.061096836  0.744 40
## 10     ht_vhalfinact penetrance_lqt2   1000 -0.173348992  0.443 25
## 11 ht_recovfrominact penetrance_lqt2   1000 -0.407537689  0.402  6
## 12  ht_taudeact_fast penetrance_lqt2   1000  0.009703229  0.970 14
## 13         pph2_prob penetrance_lqt2   1000  0.332107233  0.006 81
## 14     provean_score penetrance_lqt2   1000 -0.455185289  0.000 81
## 15        blast_pssm penetrance_lqt2   1000  0.055910704  0.687 81
## 16          pamscore penetrance_lqt2   1000  0.058560727  0.628 83
## 17   aasimilaritymat penetrance_lqt2   1000  0.063049344  0.628 83
## 18         lqt2_dist penetrance_lqt2   1000  0.755839461  0.000 83
## 19       revel_score penetrance_lqt2   1000  0.612183017  0.000 83

Scale all covariates

Calculate EM priors and posteriors for all variants

Use an EM algorithm to estimate LQT2 PPV for each variant (or LQT2 diagnosis probability)

Literature Variants Where N = 1 Variants are removed

# Literature dataset where potentially overlapping carriers/heterozygotes are removed
d <- lit.nonoverlap.data[lit.nonoverlap.data$mut_type == "missense",]

# set initial weighting and penetrance
d$weight = 1-1/(0.01+d$total_carriers)
d$penetrance_lqt2 <- d$lqt2/d$total_carriers
d[d$total_carriers < 2,"weight"] <- 0.000 # This is changed to "< 2" here to evaluate ROC-AUC of n=1 variants from the literature

# set initial weighting and penetrance
d$weight = 1-1/(0.01+d$total_carriers)
d$penetrance_lqt2 <- d$lqt2/d$total_carriers
d[d$total_carriers < 1,"weight"] <- 0.000 # This is changed to "< 2" when evaluating ROC-AUC of n=1 variants from the literature

LQT2 empirical diagnosis probability prior

Use observed LQT2 diagnosis probability to calculate “LQTS probability density” as described in previous publication. Plot LQTS probability density versus residue

# Weighted mean to determine LQT2 penetrance empirical prior
newdata = data.frame(wt=1)
model <- lm(penetrance_lqt2 ~ 1, data=d, weights = d$weight)
summary(model)
## 
## Call:
## lm(formula = penetrance_lqt2 ~ 1, data = d, weights = d$weight)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.2329 -0.1653 -0.0232  0.0763  0.7561 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.2332     0.0135   17.27   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2323 on 789 degrees of freedom
##   (467 observations deleted due to missingness)
p<-predict(model, newdata)
dev<- mse(model) #p*(1-p)

# Estimated shape parameters for LQT2 empirical prior
alpha0 = estBetaParams(p,dev)$alpha
beta0 = estBetaParams(p,dev)$beta
print(paste("alpha0 = ", alpha0, "  beta0 = ", beta0))
## [1] "alpha0 =  0.54041284982238   beta0 =  1.77720039413882"
# Bayesian LQT2 penetrance estimates from empirical priors 
# and observed affected/unaffected counts:
d$lqt2_penetranceBayesian_initial <- (alpha0 + d[,"lqt2"])/((alpha0 + beta0 + d[,"total_carriers"]))
d$lqt2_penetranceBayesian<-d$lqt2_penetranceBayesian_initial

Calculate LQTS probability densities

With the updated empirical priors applied to carrier counts, calculate “LQTS probability density” as described in previous publication.

h2.covariates[, "lqt2_dist"]<-NA
h2.covariates[, "lqt2_dist_weight"]<-NA
mut_type <- mut_type[mut_type$mut_type == "missense" & mut_type$isoform == "A",]
for(rec in 1:nrow(mut_type)){
  if (!is.na(mut_type[rec, "resnum"])) {
    h2.covariates[h2.covariates$var == mut_type[rec, "var"], 
                  c("lqt2_dist", "lqt2_dist_weight", "resnum")] <- c(funcdist(mut_type[rec, "resnum"], mut_type[rec, "var"], d[d$total_carriers>1,], h2dist, "penetrance_lqt2", "sigmoid", 7), mut_type[rec, "resnum"])
  }
}

# Plot lqt2_dist versus residue number
h2.covariates$resnum<-as.integer(h2.covariates$resnum)
h2.covariates <- h2.covariates[order(h2.covariates$resnum),]
h2.covariates <- h2.covariates[!is.na(h2.covariates$resnum),]

# Merge "d" with full variant list and set carrier counts to 0. 
# This is done for convenience so  we can estimate LQT2 penetrance for all variants including 
# those witheld during model construction. This will make validation easier.
d <- merge(d, h2.covariates, all = TRUE)
d <- unique(d) 

Scale all covariates

Calculate EM priors and posteriors for all variants

Use an EM algorithm to estimate LQT2 PPV for each variant (or LQT2 diagnosis probability)

Literature and Cohort Combined (for final predictions)

# Literature dataset where potentially overlapping carriers/heterozygotes are removed
d <- lit.cohort.data[lit.cohort.data$mut_type == "missense",]

# add all possible variants
allvariants<-data.frame(unlist(h2.covariates[!is.na(h2.covariates$cardiacboost),"var"]), stringsAsFactors = F)
names(allvariants)<-"var"
allvariants$isoform<-unlist(h2.covariates[!is.na(h2.covariates$cardiacboost),"isoform"])
allvariants$mut_type<-unlist(h2.covariates[!is.na(h2.covariates$cardiacboost),"mut_type"])
a<-merge(d,allvariants,all = T)
a[is.na(a$total_carriers),"total_carriers"] <- 0
a[is.na(a$lqt2),"lqt2"] <- 0
d<-a

# set initial weighting and penetrance
d$weight = 1-1/(0.01+d$total_carriers)
d$penetrance_lqt2 <- d$lqt2/d$total_carriers
d[d$total_carriers < 1,"weight"] <- 0.000 # This is changed to "< 2" here to evaluate ROC-AUC of n=1 variants from the literature

LQT2 empirical diagnosis probability prior

Use observed LQT2 diagnosis probability to calculate “LQTS probability density” as described in previous publication. Plot diagnosis probability density versus residue

# Mean squared error
mse <- function(sm) {
  mean((sm$residuals)^2*(sm$weights))
}

# Derive alpha and beta from weighted mean and MSE (estimated variance)
estBetaParams <- function(mu, var) {
  alpha <- ((1 - mu) / var - 1 / mu) * mu ^ 2
  beta <- alpha * (1 / mu - 1)
  return(params = list(alpha = alpha, beta = beta))
}

# Weighted mean to determine LQT2 penetrance empirical prior
newdata = data.frame(wt=1)
model <- lm(penetrance_lqt2 ~ 1, data=d, weights = d$weight)
summary(model)
## 
## Call:
## lm(formula = penetrance_lqt2 ~ 1, data = d, weights = d$weight)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3424 -0.2429 -0.0341  0.0654  0.6315 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.34273    0.01253   27.35   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2541 on 1017 degrees of freedom
##   (6240 observations deleted due to missingness)
p<-predict(model, newdata)
dev<- mse(model) #p*(1-p)

# Estimated shape parameters for LQT2 empirical prior
alpha0 = estBetaParams(p,dev)$alpha
beta0 = estBetaParams(p,dev)$beta
print(paste("alpha0 = ", alpha0, "  beta0 = ", beta0))
## [1] "alpha0 =  0.854290338628253   beta0 =  1.63833103141397"
# Bayesian LQT2 penetrance estimates from empirical priors 
# and observed affected/unaffected counts:
d$lqt2_penetranceBayesian_initial <- (alpha0 + d[,"lqt2"])/((alpha0 + beta0 + d[,"total_carriers"]))
d$lqt2_penetranceBayesian<-d$lqt2_penetranceBayesian_initial

Calculate LQTS probability densities and annotate function and structural location

With the updated empirical priors applied to carrier counts, calculate “LQTS probability density” as described in previous publication. !!! NOTE: since these data are truly the “best estimates” we include all variants in the calculation such that unique scores are by residue not by variant.

h2.covariates[, "lqt2_dist"]<-NA
h2.covariates[, "lqt2_dist_weight"]<-NA
ld<-0
for(rec in seq(2,1159,1)){
  #print(rec)
  ld <- funcdist(rec, "var", d[!is.na(d$total_carriers) & d$total_carriers>0 & d$isoform == "A" & d$mut_type != "nonsense",], h2dist, "penetrance_lqt2", "sigmoid", 7)
  h2.covariates[!is.na(h2.covariates$isoform) & !is.na(h2.covariates$resnum) & h2.covariates$isoform=="A" & h2.covariates$resnum == rec & !is.na(h2.covariates$var) & !is.na(h2.covariates$mut_type) & h2.covariates$mut_type == "missense","lqt2_dist"] <- ld[1]
  h2.covariates[!is.na(h2.covariates$isoform) & !is.na(h2.covariates$resnum) & h2.covariates$isoform=="A" & h2.covariates$resnum == rec & !is.na(h2.covariates$var)& !is.na(h2.covariates$mut_type) & h2.covariates$mut_type == "missense","lqt2_dist_weight"] <-ld[2] 
}

# Plot lqt2_dist versus residue number
h2.covariates$resnum<-as.integer(h2.covariates$resnum)
h2.covariates <- h2.covariates[order(h2.covariates$resnum),]
h2.covariates <- h2.covariates[!is.na(h2.covariates$resnum),]

# Merge "d" with full variant list and set carrier counts to 0. 
# This is done for convenience so  we can estimate LQT2 penetrance for all variants including 
# those witheld during model construction. This will make validation easier.
d <- merge(d, h2.covariates, all = TRUE)
d <- unique(d) 

# annotate structural location (hotspot) 
d$Structure<-NA
d[!is.na(d$lqt2_dist) & d$lqt2_dist<0.1,"Structure"]<-"Non_Hotspot"
d[!is.na(d$lqt2_dist) & d$lqt2_dist>=0.1 & d$lqt2_dist<0.4,"Structure"]<-"Mild_Hotspot"
d[!is.na(d$lqt2_dist) & d$lqt2_dist>=0.4,"Structure"]<-"Hotspot"

# annotate functional perturbation
d$Function<-NA
d[!is.na(d$ht_tailPeak) & d$ht_tailPeak<0.25,"Function"]<-"Severe Dominant LOF"
d[!is.na(d$ht_tailPeak) & d$ht_tailPeak<0.5 & d$ht_tailPeak>=0.25,"Function"]<-"Dominant LOF"
d[!is.na(d$ht_tailPeak) & d$ht_tailPeak>=0.5 & d$ht_tailPeak<0.75,"Function"]<-"LOF"
d[!is.na(d$ht_tailPeak) & d$ht_tailPeak>=0.75 & d$ht_tailPeak<1.25,"Function"]<-"Normal"
d[!is.na(d$ht_tailPeak) & d$ht_tailPeak>=1.25,"Function"]<-"GOF"

Calculate EM priors and posteriors for all variants

Use an EM algorithm to estimate LQT2 PPV for each variant (or LQT2 diagnosis probability)

LQT2 diagnosis probability in Cohort Data for validation

load("prepared_data.RData")
load("lit_all_data_checkpoint.RData")

cohort.data$weight = 1-1/(0.01+cohort.data$total_carriers)
cohort.data$penetrance_lqt2 <- cohort.data$lqt2/cohort.data$total_carriers

for (variant in cohort.data$var) {
  if (!is.na(match(variant, d$var))) {
     cohort.data[cohort.data$var == variant, c("p_mean_w","alpha", "beta", "lqt2_lit", "unaff_lit", "total_carriers_lit", "lqt2_dist")] <- d[match(variant, d$var), c("p_mean_w", "alpha", "beta", "lqt2", "unaff", "total_carriers", "lqt2_dist")]
  }
}

m<- cohort.data %>% 
  select(resnum, pmean = penetrance_lqt2) %>% 
  mutate(p_mean_smooth = rollmean(pmean, k=20, fill = NA))

fit <- loess(cohort.data[,"penetrance_lqt2"]~as.numeric(cohort.data[,"resnum"]), span = 0.15)
plot(cohort.data$resnum, cohort.data$penetrance_lqt2, xlab ="Residue", ylab = "LQT2 diagnosis probability", col = "red", pch=19)
xrange <- seq(min(fit$x), max(fit$x), length.out = 100)
ps <- predict(fit, xrange, se=T)