1 Introduction and model equations

In this document, we simulate data to test the model's ability to accurately estimate two parameters of interest: the coefficient \(\beta_X\) of the potential drivers of the within-population genetic variation (i.e. the variation among clones) and the standard deviation \(\sigma_K\) of the within-population genetic variation.

We compared two models: in the first, the effect of the potential drivers applies to the median of the within-population genetic variation; in the second, it applies to the mean. The two model equations are given below.

Model 1 (effect on the median)

\[\begin{equation}
\begin{aligned}
y_{bpcr} & \sim \mathcal{N}(\mu_{bpc},\sigma^{2}_{r})\\[3pt]
\mu_{bpc} & = \beta_{0} + B_{b} + P_{p} + C_{c(p)}\\[3pt]
\beta_{0} & \sim \mathcal{N}(\mu_{y},2)\\[3pt]
\begin{bmatrix} B_{b} \\ P_{p} \end{bmatrix} & \sim \mathcal{N}\left(\mathbf{0}, \begin{bmatrix} \sigma^{2}_{B} & 0\\ 0 & \sigma^{2}_{P} \end{bmatrix} \right)\\[3pt]
C_{c(p)} & \sim \mathcal{N}(0,\sigma^{2}_{C_{p}})\\[3pt]
\overline{\sigma_{C_{p}}^{2}} & = \frac{\sum_{p=1}^{33} \sigma_{C_{p}}^{2}}{33} \\[3pt]
\sigma_{tot}^{2} & = \sigma_{r}^{2} + \sigma_{B}^{2} + \overline{\sigma_{C_{p}}^{2}} + \sigma_{P}^{2}\\[3pt]
\sigma_{r} & = \sigma_{tot} \times \sqrt{\pi_{1}}\\[3pt]
\sigma_{B} & = \sigma_{tot} \times \sqrt{\pi_{2}}\\[3pt]
\sigma_{P} & = \sigma_{tot} \times \sqrt{\pi_{3}}\\[3pt]
\sigma_{C_{p}} & \sim \mathcal{LN}\left(\ln\left(\sigma_{tot} \times \sqrt{\pi_{4}}\right)-\frac{\sigma^{2}_{K}}{2} + \beta_{X}X_{p},\;\sigma^{2}_{K}\right)\\[3pt]
\sigma_{K} & \sim \mathrm{Exponential}(1)\\[3pt]
\sigma_{tot} & \sim \mathcal{S}^{*}(0,1,3)
\end{aligned}
\end{equation}\]

Model 2 (effect on the mean)

\[\begin{equation}
\begin{aligned}
y_{bpcr} & \sim \mathcal{N}(\mu_{bpc},\sigma^{2}_{r})\\[3pt]
\mu_{bpc} & = \beta_{0} + B_{b} + P_{p} + C_{c(p)}\\[3pt]
\beta_{0} & \sim \mathcal{N}(\mu_{y},2)\\[3pt]
\begin{bmatrix} B_{b} \\ P_{p} \end{bmatrix} & \sim \mathcal{N}\left(\mathbf{0}, \begin{bmatrix} \sigma^{2}_{B} & 0\\ 0 & \sigma^{2}_{P} \end{bmatrix} \right)\\[3pt]
C_{c(p)} & \sim \mathcal{N}(0,\sigma^{2}_{C_{p}})\\[3pt]
\overline{\sigma_{C_{p}}^{2}} & = \frac{\sum_{p=1}^{33} \sigma_{C_{p}}^{2}}{33} \\[3pt]
\sigma_{tot}^{2} & = \sigma_{r}^{2} + \sigma_{B}^{2} + \overline{\sigma_{C_{p}}^{2}} + \sigma_{P}^{2}\\[3pt]
\sigma_{r} & = \sigma_{tot} \times \sqrt{\pi_{1}}\\[3pt]
\sigma_{B} & = \sigma_{tot} \times \sqrt{\pi_{2}}\\[3pt]
\sigma_{P} & = \sigma_{tot} \times \sqrt{\pi_{3}}\\[3pt]
\sigma_{C_{p}} & \sim \mathcal{LN}\left(\ln\left(\sigma_{tot} \times \sqrt{\pi_{4}} + \beta_{X}X_{p} \right)-\frac{\sigma^{2}_{K}}{2},\;\sigma^{2}_{K}\right)\\[3pt]
\sigma_{K} & \sim \mathrm{Exponential}(1)\\[3pt]
\sigma_{tot} & \sim \mathcal{S}^{*}(0,1,3)
\end{aligned}
\end{equation}\]

Comments on the log-normal parameterisation:

\[\begin{equation}
\begin{aligned}
\sigma_{C_{p}} \sim \mathcal{LN}(\mu,\sigma^{2}) & \Leftrightarrow \ln(\sigma_{C_{p}}) \sim \mathcal{N}(\mu,\sigma^{2})\\[3pt]
\text{By definition: } \mathbb{E}(\sigma_{C_{p}}) & = \exp\left(\mu + \frac{\sigma^{2}}{2}\right)\\[3pt]
\text{We want: } \mathbb{E}(\sigma_{C_{p}}) & = \exp\left(\mu + \frac{\sigma^{2}}{2}\right) = \sigma_{tot} \times \sqrt{\pi_{4}}\\[3pt]
\text{Therefore: } \mu & = \ln\left(\sigma_{tot} \times \sqrt{\pi_{4}}\right) - \frac{\sigma^{2}}{2}
\end{aligned}
\end{equation}\]
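As a quick numerical check of this parameterisation (a minimal sketch, not part of the analysis pipeline; the target value \(m\) below is an arbitrary stand-in for \(\sigma_{tot}\sqrt{\pi_{4}}\)), we can draw from a log-normal with \(\mu = \ln(m) - \sigma^{2}_{K}/2\) and verify that the sample mean is close to \(m\):

# Numerical check of the log-normal parameterisation:
# with meanlog = log(m) - sigma_K^2/2, the log-normal mean is approximately m.
m <- 2.5       # hypothetical target mean, standing in for sigma_tot * sqrt(pi4)
sigma_K <- 0.1 # same value as the true sigma_K used in the simulations below
set.seed(1)
mean(rlnorm(1e6, meanlog = log(m) - sigma_K^2/2, sdlog = sigma_K)) # ~ 2.5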

2 Parameters and functions to simulate the data

2.1 Parameters and options

traits <- c("AST_htdec11", # Asturias, Spain - November 2012
            "POR_htoct12", # Fundao, Portugal - October 2012
            "BDX_htnov13") # Bordeaux, France - November 2013

# Size of the simulated dataset
nb.clones <- "real-based" # base the simulations on the real experimental design
#nb.clones <- 20 # balanced design: 33 populations x 20 clones x 8 blocks = 5280 trees


# Which model? 
which.model <- "model1"


# True values of the parameters
##############################

# betaX
if(which.model=="model1") {betaX <- 0.1} else if(which.model=="model2"){betaX <- 1.2}

# Standard deviation of the log-normal distribution of the within-population genetic variation
sigma_K <- 0.1

# Variance partitioning
pi1 <- 0.5 # prop of variance explained by residuals
pi2 <- 0.05 # prop of variance explained by block effects
pi3 <- 0.15 # prop of variance explained by the provenance
pi4 <- 0.3 # prop of variance explained by the clone effects


# MCMC parameters
#################

# Options for model sampling
n_chains <- 4
n_iter <- 2500
n_warm <- 1250
n_thin <- 1
save_warmup <- FALSE

# Options for the MCMC intervals
point.est <- "median"
prob.in <- 0.9
prob.out <- 0.95
conf.level <- 0.95
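With these settings, each model fit is based on 4 chains of 2,500 iterations (1,250 of which are warmup), i.e. 4 × 1,250 = 5,000 post-warmup draws per model (no thinning).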

2.1.1 Functions

2.1.1.1 SimulData()

#### Function to simulate the data ####
SimulData <- function(trait,nb.set.seed){
set.seed(nb.set.seed)

# As we want to base our simulations on the real experimental design, 
  # we load the trait data and extract the number of blocks, populations,
  # clones per population and trees per clone: 
df <- readRDS(file="data/ClonapinData/PhenoDataNovember2019_AnnualTraits.rds") %>% 
  dplyr::filter(!(prov=="MAD")) %>% 
  dplyr::rename(trait=all_of(trait)) %>% 
  dplyr::select(prov,clon,tree,block,trait) %>% 
  drop_na() %>% 
  arrange(tree)

# We keep the same trait mean as the real data:
beta0 <- mean(df$trait)

# Same for the standard deviation and variance of the trait:
sigma_tot <- sqrt(var(df$trait))
sigma2_tot <- var(df$trait)


# If we don't want to base our simulations on the real experimental design, 
  # we run these lines instead: we create a perfectly balanced dataset of 33 populations, 
  # 20 clones per population and one tree per clone in each of the 8 blocks
if(nb.clones==20){
  
vec.prov <- vector()
for(i in unique(df$prov)){vec.prov <- c(vec.prov,rep(i,nb.clones*8))}

vec.blocks <- vector()
for(i in 1:8){vec.blocks <- c(vec.blocks,rep(i,nb.clones))}

df <- tibble(prov=vec.prov,
             clon=paste0(prov,rep(1:nb.clones,264)),
             block=rep(vec.blocks,33),
             tree=paste0(clon,"_",block))
}


# We want: Var(Pop) + Var(Block) + Var(Clone) + Var(Residual) = Var(total)
sigma_prov <-  sqrt(sigma2_tot*pi3) # Among populations variation explains 15% of the total variance
sigma_block <-  sqrt(sigma2_tot*pi2) # Among blocks variation explains 5% of the total variance
sigma_r <- sqrt(sigma2_tot*pi1) # Residuals explain 50% of the total variance
mean_sigma_clon <- sqrt(sigma2_tot*pi4) # Among clones variation explains 30% of the total variance 

if(which.model=="model1"){  # Model 1
# In the first model, the effect of the potential drivers is 
  # on the median of the within-population genetic variation
 
Xp <- rnorm(length(unique(df$prov)),mean=0,sd=1) # Potential driver that is mean-centered
mu_sigma_clon <- log(mean_sigma_clon) - (sigma_K*sigma_K)/2  + betaX*Xp
log_sigma_clon <-  rnorm(length(unique(df$prov)),mu_sigma_clon, sigma_K )
sigma_clon <- exp(log_sigma_clon)

} else if(which.model=="model2"){ # Model 2
# In the second model, the effect of the potential drivers is
  # on the mean of the within-population genetic variation

Xp <- rnorm(length(unique(df$prov)),mean=0,sd=1)
mu_sigma_clon <- log(mean_sigma_clon + betaX*Xp) - (sigma_K*sigma_K)/2 
log_sigma_clon <-  rnorm(length(unique(df$prov)),mu_sigma_clon, sigma_K)
sigma_clon <- exp(log_sigma_clon)
}

df <- df %>% 
  inner_join(tibble(prov=unique(df$prov), # Population id
             sigma_clon= sigma_clon, # within-population genetic variation (among-clones variation)
             Xp=Xp, # Potential driver of the within-population genetic variation
             alpha_prov=rnorm(length(unique(df$prov)),mean=0,sd=sigma_prov)), # population varying intercepts
             by="prov") 

sub <- df %>% 
  dplyr::select(clon,alpha_prov,sigma_clon) %>% 
  distinct()

df <- df %>% 
  inner_join(tibble(clon=sub$clon, # clone id
             alpha_clon=rnorm(length(sub$clon),mean=0,sd=sub$sigma_clon)), # clone varying intercepts
             by="clon") %>% 
  inner_join(tibble(block=unique(df$block), # block id
             alpha_block=rnorm(length(unique(df$block)),mean=0,sd=sigma_block)), # block varying intercepts
             by="block") %>% 
  dplyr::mutate(mu=beta0 + alpha_prov + alpha_block + alpha_clon, # linear predictor
                ysim=rnorm(n(),mu,sigma_r)) # simulated trait values (one per tree)

list.sim <- list(df=df,
             beta0 =beta0,
             betaX=betaX,
             sigma_tot=sigma_tot,
             sigma_r=sigma_r,
             sigma_K =sigma_K,
             sigma_prov=sigma_prov,
             mean_sigma_clon =mean_sigma_clon ,
             mu_sigma_clon=mu_sigma_clon,
             Xp=Xp,
             sigma_block=sigma_block,
             pi2=(sigma_block*sigma_block)/(sigma_tot*sigma_tot),
             pi3=(sigma_prov*sigma_prov)/(sigma_tot*sigma_tot),
             pi4=mean(sigma_clon)*mean(sigma_clon)/(sigma_tot*sigma_tot),
             pi1=(sigma_r*sigma_r)/(sigma_tot*sigma_tot))
return(list.sim)
}
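As a quick sanity check (a sketch, not part of the original pipeline; it assumes the phenotypic data file loaded above is available and the parameters of section 2.1 are defined), the function can be called directly and the realised variance partition compared with the target proportions:

# Hypothetical usage example of SimulData()
list.sim.check <- SimulData(trait = "AST_htdec11", nb.set.seed = 44)
nrow(list.sim.check$df)                                      # number of simulated trees
round(unlist(list.sim.check[c("pi1","pi2","pi3","pi4")]), 2) # realised shares vs. targets 0.5 / 0.05 / 0.15 / 0.3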

2.1.1.2 PlotParam()
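Note that PlotParam() relies on the global objects params, param.labels and legend.position, which are redefined before each call in section 3.2 below.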

PlotParam <- function(x)  {
  
  model.draws <- readRDS(file=paste0("outputs/simulations/OneSimulation/",x,".rds")) %>% 
    as.matrix() %>% 
    as_tibble() %>% 
    dplyr::select(all_of(params))
  
  list.sim <- readRDS(file=paste0("outputs/simulations/OneSimulation/",x,"_ListSim.rds"))
  if(grepl("pi",params[[1]])==TRUE){sim.true.values <- list.sim[c("pi1","pi2","pi3","pi4")] %>% unlist()} else 
    {sim.true.values <- list.sim[params] %>% unlist()}
  
  mcmc_recover_intervals(model.draws,
                            sim.true.values,
                            prob = prob.in,
                            prob_outer = prob.out,
                            point_est=point.est) + 
  theme(axis.text = element_text(size=16))+
  labs(subtitle="")+
  scale_x_discrete(labels=param.labels) +
       theme(legend.position = legend.position,
             legend.background = element_rect(colour = "grey"),
             legend.title = element_blank(),
             axis.text.x=element_text(size=18))
}

3 One simulation

3.1 Data simulation + running the model

# set.seed of the simulations
nb.set.seed <- 44

for(trait in traits){
# Simulate the data
list.sim <- SimulData(trait,nb.set.seed)

list.stan <- list(N=length(list.sim$df$tree),
             y=list.sim$df$ysim,
             X=list.sim$Xp,
             nprov = length(unique(list.sim$df$prov)),
             nclon = length(unique(list.sim$df$clon)),
             nblock = length(unique(list.sim$df$block)),
             prov = as.numeric(as.factor(list.sim$df$prov)),
             which_prov = as.numeric(as.factor(pull(unique(list.sim$df[c("prov","clon")])[,"prov"]))), # population of each clone (for the clone-level varying SDs)
             clon = as.numeric(as.factor(list.sim$df$clon)),
             bloc = as.numeric(as.factor(list.sim$df$block)),
             sigma_K = sigma_K)

if(which.model=="model1") {
  model = stan_model("scripts/StanModels/HierarchicalModel_VaryingInterClonesSD_OneCovariate.stan")
} else if(which.model=="model2"){
  model = stan_model("scripts/StanModels/HierarchicalModel_VaryingInterClonesSD_OneCovariate_NotWorking.stan")}

fit.model <- sampling(model, 
                      data = list.stan, 
                      pars=c("beta0","betaX",
                             "pi",
                             "sigma_tot","sigma_block","sigma_prov","mean_sigma_clon","sigma_clon","sigma_K","sigma_r"),
                      iter = n_iter, 
                      chains = n_chains, 
                      cores = n_chains,
                      save_warmup= save_warmup,
                      thin=n_thin) 

saveRDS(fit.model,file=paste0("outputs/simulations/OneSimulation/",trait,".rds"))
saveRDS(list.sim,file=paste0("outputs/simulations/OneSimulation/",trait,"_ListSim.rds"))
}
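Before drawing the figures, a quick convergence check can be run on the saved fits (a minimal sketch, not part of the original pipeline, reading the files saved just above):

# Largest split-Rhat across all monitored parameters, for each trait
# (values close to 1 indicate convergence)
sapply(traits, function(x){
  fit <- readRDS(paste0("outputs/simulations/OneSimulation/", x, ".rds"))
  max(summary(fit)$summary[, "Rhat"], na.rm = TRUE)
})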

3.2 SI figures

3.2.1 Standard Deviations

params  <- c("sigma_tot","sigma_r","sigma_block","sigma_prov","mean_sigma_clon")
param.labels <- c(parse(text = TeX("$mean(\\sigma_{C_{p}})$")),
                  parse(text = TeX("$\\sigma_{B}$")),
                  parse(text = TeX("$\\sigma_{P}$")),
                  parse(text = TeX("$\\sigma_{r}$")),
                  parse(text = TeX("$\\sigma_{tot}$")))
legend.position <- c(0.2,0.9)



plots <- mclapply(traits,PlotParam)
plots[[2]] <-  plots[[2]] + theme(legend.position = "none")
plots[[3]] <-  plots[[3]] + theme(legend.position = "none")


p <- plot_grid(plots[[1]],plots[[2]],plots[[3]],
               labels = c('Asturias (21 months)','Fundão (20 months)',"Bordeaux (25 months)"),
               label_size = 20,
               nrow=1)
ggsave(p,file=paste0("figs/Simulations/OneSimulation_StandardDeviations.png"),height=8,width=17)
p

3.2.2 Variance partitioning

params  <- c("pi[1]","pi[2]","pi[3]","pi[4]")
param.labels <- c(parse(text = TeX("$\\pi_{r}$")),
                  parse(text = TeX("$\\pi_{B}$")),
                  parse(text = TeX("$\\pi_{P}$")),
                  parse(text = TeX("$\\pi_{C}$")))
legend.position <- c(0.8,0.9)


plots <- mclapply(traits,PlotParam)
plots[[2]] <-  plots[[2]] + theme(legend.position = "none")
plots[[3]] <-  plots[[3]] + theme(legend.position = "none")


p <- plot_grid(plots[[1]],plots[[2]],plots[[3]],
               labels = c('Asturias (21 months)','Fundão (20 months)',"Bordeaux (25 months)"),
               label_size = 20,
               nrow=1)
ggsave(p,file=paste0("figs/Simulations/OneSimulation_VariancePartitioning.png"),height=8,width=17)
p

3.2.3 sigma_K and beta_X

params  <- c("sigma_K","betaX")
param.labels <- c(parse(text = TeX("$\\beta_{X}$")),
                  parse(text = TeX("$\\sigma_{K}$")))
legend.position <- c(0.3,0.85)


plots <- mclapply(traits,PlotParam)
plots[[2]] <-  plots[[2]] + theme(legend.position = "none")
plots[[3]] <-  plots[[3]] + theme(legend.position = "none")


p <- plot_grid(plots[[1]],plots[[2]],plots[[3]],
               labels = c('Asturias (21 months)','Fundão (20 months)',"Bordeaux (25 months)"),
               label_size = 20,
               nrow=1)
ggsave(p,file=paste0("figs/Simulations/OneSimulation_SigmaKBetaX.png"),height=6,width=17)
p

3.2.4 Correlation between sigma_K and beta_X

plots <- mclapply(traits,function(x){
  
fit.model <- readRDS(file=paste0("outputs/simulations/OneSimulation/",x,".rds"))
np <- nuts_params(fit.model)
p <- mcmc_pairs(as.array(fit.model), 
           np = np,
           pars = c("sigma_K","betaX"), 
           off_diag_args = list(size = 1, alpha = 1/3),
           np_style = pairs_style_np(div_size=3, div_shape = 19),
           max_treedepth =10)
  
})

p <- plot_grid(plots[[1]],plots[[2]],plots[[3]],
               labels = c('A)','B)',"C)"),
               label_size = 20,vjust=1.8,
               nrow=1)
ggsave(p,file=paste0("figs/Simulations/OneSimulation_CorrelationsSigmaKBetaX.png"),height=6,width=17)
p

3.2.5 Old figures

4 100 Simulations

4.1 Running 100 models

set.seed(23)

nb.set.seed <- sample(1:1000,100,replace=FALSE)
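# NB (assumption, based on section 4.2 which reads one output file per trait): this chunk is run
# once per trait, with `trait` set to the focal trait beforehand (e.g. trait <- traits[[1]]).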

sim <- lapply(nb.set.seed,function(x){

  list.sim <- SimulData(trait=trait,nb.set.seed=x)
  
  list.stan <- list(N=length(list.sim$df$tree),
             y=list.sim$df$ysim,
             X=list.sim$Xp,
             nprov = length(unique(list.sim$df$prov)),
             nclon = length(unique(list.sim$df$clon)),
             nblock = length(unique(list.sim$df$block)),
             prov = as.numeric(as.factor(list.sim$df$prov)),
             which_prov = as.numeric(as.factor(pull(unique(list.sim$df[c("prov","clon")])[,"prov"]))),
             clon = as.numeric(as.factor(list.sim$df$clon)),
             bloc = as.numeric(as.factor(list.sim$df$block)),
             sigma_K = sigma_K)
  
  if(which.model=="model1") {
  model = stan_model("scripts/StanModels/HierarchicalModel_VaryingInterClonesSD_OneCovariate.stan")
} else if(which.model=="model2"){
  model = stan_model("scripts/StanModels/HierarchicalModel_VaryingInterClonesSD_OneCovariate_NotWorking.stan")}
  
  fit.model <- sampling(model, 
                        data = list.stan, 
                        pars=c("betaX","sigma_K"), 
                        iter = n_iter, 
                        chains = n_chains, 
                        cores = n_chains,
                        save_warmup = save_warmup,
                        thin=n_thin) 
  
   conf95 <- broom::tidyMCMC(fit.model,pars=(c("betaX","sigma_K")),
                droppars = NULL, estimate.method = "median", ess = T, rhat = T, conf.int = T,conf.level = 0.95) %>% 
     dplyr::rename(conf.low.95=conf.low,
                   conf.high.95=conf.high,
                   median=estimate)
   
   broom::tidyMCMC(fit.model,pars=(c("betaX","sigma_K")),
                droppars = NULL, estimate.method = "mean", ess = F, rhat = F, conf.int = T,conf.level = 0.80)%>% 
     dplyr::rename(conf.low.80=conf.low,
                   conf.high.80=conf.high,
                   mean=estimate) %>% 
     inner_join(conf95,by=c("term","std.error")) %>% 
     dplyr::select(term,mean,median,std.error,conf.low.80,conf.high.80,conf.low.95,conf.high.95, rhat ,  ess)
  
})

saveRDS(sim,file=paste0("outputs/simulations/100sim_",which.model,"_",trait,".rds"))

4.2 SI table

tab <-   lapply(traits, function(trait){

# betaX
sim <- readRDS(file=paste0("outputs/simulations/100sim_model1_",trait,".rds")) %>% 
  bind_rows() %>% 
  filter(term=="betaX") %>% 
  mutate(bias.mean=mean-betaX,
         bias.median=median-betaX,
         coverage.conf.80=case_when(
           betaX > conf.low.80  & betaX < conf.high.80 ~ 1,
           betaX < conf.low.80 ~ 0,
           betaX > conf.high.80 ~ 0),
         coverage.conf.95=case_when(
           betaX > conf.low.95  & betaX < conf.high.95 ~ 1,
           betaX < conf.low.95 ~ 0,
           betaX > conf.high.95 ~ 0))

tib <- tibble(term="betaX",
       true.value=betaX,
       mean.std.error=mean(sim$std.error),
       mean.bias.mean=mean(sim$bias.mean),
       mean.bias.median=mean(sim$bias.median),
       coverage.conf.80=sum(sim$coverage.conf.80),
       coverage.conf.95=sum(sim$coverage.conf.95))

# sigma_k
sim <- readRDS(file=paste0("outputs/simulations/100sim_model1_",trait,".rds")) %>% 
  bind_rows() %>% 
  filter(term=="sigma_K") %>% 
  mutate(bias.mean=mean-sigma_K,
         bias.median=median-sigma_K,
         coverage.conf.80=case_when(
           sigma_K > conf.low.80  & sigma_K < conf.high.80 ~ 1,
           sigma_K < conf.low.80 ~ 0,
           sigma_K > conf.high.80 ~ 0),
         coverage.conf.95=case_when(
           sigma_K > conf.low.95  & sigma_K < conf.high.95 ~ 1,
           sigma_K < conf.low.95 ~ 0,
           sigma_K > conf.high.95 ~ 0))

tib <- tib %>% 
  bind_rows(tibble(term="sigma_K",
       true.value=sigma_K,
       mean.std.error=mean(sim$std.error),
       mean.bias.mean=mean(sim$bias.mean),
       mean.bias.median=mean(sim$bias.median),
       coverage.conf.80=sum(sim$coverage.conf.80),
       coverage.conf.95=sum(sim$coverage.conf.95))) %>% 
  mutate(Traits=trait)
}) %>% 
  bind_rows()

tab %>% 
  kable(digits=3) %>% 
  kable_styling(latex_options =c("striped", "hold_position"))
term     true.value  mean.std.error  mean.bias.mean  mean.bias.median  coverage.conf.80  coverage.conf.95  Traits
betaX    0.1         0.049           -0.004          -0.004            81                96                AST_htdec11
sigma_K  0.1         0.060            0.006           0.001            93                99                AST_htdec11
betaX    0.1         0.053           -0.003          -0.003            82                99                POR_htoct12
sigma_K  0.1         0.065            0.009           0.002            98                100               POR_htoct12
betaX    0.1         0.054            0.006           0.006            76                96                BDX_htnov13
sigma_K  0.1         0.065            0.013           0.006            95                99                BDX_htnov13
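In this table, mean.bias.mean and mean.bias.median are the average differences (over the 100 simulations) between the posterior mean or median and the true parameter value, and coverage.conf.80 and coverage.conf.95 give the number of simulations (out of 100) in which the 80% and 95% credible intervals contained the true value; well-calibrated intervals should give values close to 80 and 95, respectively.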
# Save as a latex table for the Supplementary Information
print(xtable(tab, type = "latex",digits=3), 
      file = paste0("tables/100Simulations_Outputs.tex"), 
      include.rownames=FALSE)

The second model (in which the effect of the potential drivers applies to the mean of the within-population genetic variation) yields wider credible intervals and thus less precise estimates. We therefore keep the first model (in which the effect applies to the median of the within-population genetic variation) for the subsequent analyses and only report its outputs in the paper.