1 Introduction and model equations

In this document, we simulate data to test the model's ability to accurately estimate two parameters of interest: the coefficient \(\beta_X\) of the potential drivers of the within-population genetic variation (i.e. the variation among clones) and the standard deviation \(\sigma_K\) of the within-population genetic variation.

We compared two models: in the first, the effect of the potential drivers applies to the median of the within-population genetic variation; in the second, it applies to the mean. The two model equations are given below.

Model 1 (effect on the median)

\[\begin{equation}
\begin{aligned}
y_{bpcr} & \sim \mathcal{N}(\mu_{bpc},\sigma^{2}_{r})\\[3pt]
\mu_{bpc} & = \beta_{0} + B_{b} + P_{p} + C_{c(p)}\\[3pt]
\beta_{0} & \sim \mathcal{N}(\mu_{y},2)\\[3pt]
\begin{bmatrix} B_{b} \\ P_{p} \end{bmatrix} & \sim \mathcal{N}\left(\mathbf{0}, \begin{bmatrix} \sigma^{2}_{B} & 0\\ 0 & \sigma^{2}_{P} \end{bmatrix} \right)\\[3pt]
C_{c(p)} & \sim \mathcal{N}(0,\sigma^{2}_{C_{p}})\\[3pt]
\overline{\sigma_{C_{p}}^{2}} & = \frac{\sum_{p=1}^{33} \sigma_{C_{p}}^{2}}{33} \\[3pt]
\sigma_{tot}^{2} & = \sigma_{r}^{2} + \sigma_{B}^{2} + \overline{\sigma_{C_{p}}^{2}} + \sigma_{P}^{2}\\[3pt]
\sigma_{r} & = \sigma_{tot} \times \sqrt{\pi_{1}}\\[3pt]
\sigma_{B} & = \sigma_{tot} \times \sqrt{\pi_{2}}\\[3pt]
\sigma_{P} & = \sigma_{tot} \times \sqrt{\pi_{3}}\\[3pt]
\sigma_{C_{p}} & \sim \mathcal{LN}\left(\ln\left(\sigma_{tot} \times \sqrt{\pi_{4}}\right)-\frac{\sigma^{2}_{K}}{2} + \beta_{X}X_{p},\;\sigma^{2}_{K}\right)\\[3pt]
\sigma_{K} & \sim \mathrm{Exponential}(1)\\[3pt]
\sigma_{tot} & \sim \mathcal{S}^{*}(0,1,3)
\end{aligned}
\end{equation}\]

Model 2 (effect on the mean)

\[\begin{equation}
\begin{aligned}
y_{bpcr} & \sim \mathcal{N}(\mu_{bpc},\sigma^{2}_{r})\\[3pt]
\mu_{bpc} & = \beta_{0} + B_{b} + P_{p} + C_{c(p)}\\[3pt]
\beta_{0} & \sim \mathcal{N}(\mu_{y},2)\\[3pt]
\begin{bmatrix} B_{b} \\ P_{p} \end{bmatrix} & \sim \mathcal{N}\left(\mathbf{0}, \begin{bmatrix} \sigma^{2}_{B} & 0\\ 0 & \sigma^{2}_{P} \end{bmatrix} \right)\\[3pt]
C_{c(p)} & \sim \mathcal{N}(0,\sigma^{2}_{C_{p}})\\[3pt]
\overline{\sigma_{C_{p}}^{2}} & = \frac{\sum_{p=1}^{33} \sigma_{C_{p}}^{2}}{33} \\[3pt]
\sigma_{tot}^{2} & = \sigma_{r}^{2} + \sigma_{B}^{2} + \overline{\sigma_{C_{p}}^{2}} + \sigma_{P}^{2}\\[3pt]
\sigma_{r} & = \sigma_{tot} \times \sqrt{\pi_{1}}\\[3pt]
\sigma_{B} & = \sigma_{tot} \times \sqrt{\pi_{2}}\\[3pt]
\sigma_{P} & = \sigma_{tot} \times \sqrt{\pi_{3}}\\[3pt]
\sigma_{C_{p}} & \sim \mathcal{LN}\left(\ln\left(\sigma_{tot} \times \sqrt{\pi_{4}} + \beta_{X}X_{p} \right)-\frac{\sigma^{2}_{K}}{2},\;\sigma^{2}_{K}\right)\\[3pt]
\sigma_{K} & \sim \mathrm{Exponential}(1)\\[3pt]
\sigma_{tot} & \sim \mathcal{S}^{*}(0,1,3)
\end{aligned}
\end{equation}\]

Comments on the log-normal parameterisation:

\[\begin{equation}
\begin{aligned}
\sigma_{C_{p}} \sim \mathcal{LN}(\mu,\sigma^{2}) & \Leftrightarrow \ln(\sigma_{C_{p}}) \sim \mathcal{N}(\mu,\sigma^{2})\\[3pt]
\text{By definition: } \mathbb{E}(\sigma_{C_{p}}) & = \exp\left(\mu + \frac{\sigma^{2}}{2}\right)\\[3pt]
\text{We want: } \mathbb{E}(\sigma_{C_{p}}) & = \exp\left(\mu + \frac{\sigma^{2}}{2}\right) = \sigma_{tot} \times \sqrt{\pi_{4}}\\[3pt]
\text{Therefore: } \mu & = \ln\left(\sigma_{tot} \times \sqrt{\pi_{4}}\right) - \frac{\sigma^{2}}{2}
\end{aligned}
\end{equation}\]
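As a quick numerical check of this parameterisation (a minimal sketch, not part of the analysis pipeline; the target value \(m\) below is an arbitrary stand-in for \(\sigma_{tot}\sqrt{\pi_{4}}\)), we can draw from a log-normal with \(\mu = \ln(m) - \sigma^{2}_{K}/2\) and verify that the sample mean is close to \(m\):

# Numerical check of the log-normal parameterisation:
# with meanlog = log(m) - sigma_K^2/2, the log-normal mean is approximately m.
m <- 2.5       # hypothetical target mean, standing in for sigma_tot * sqrt(pi4)
sigma_K <- 0.1 # same value as the true sigma_K used in the simulations below
set.seed(1)
mean(rlnorm(1e6, meanlog = log(m) - sigma_K^2/2, sdlog = sigma_K)) # ~ 2.5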

2 Parameters and functions to simulate the data

2.1 Parameters and options

traits <- c("AST_htdec11", # Asturias, Spain - November 2012
            "POR_htoct12", # Fundao, Portugal - October 2012
            "BDX_htnov13") # Bordeaux, France - November 2013

# Size of the simulated dataset
nb.clones <- "real-based" # base the simulations on the real experimental design
#nb.clones <- 20 # balanced design: 33 populations x 20 clones x 8 blocks = 5280 trees


# Which model? 
which.model <- "model1"


# True values of the parameters
##############################

# betaX
if(which.model=="model1") {betaX <- 0.1} else if(which.model=="model2"){betaX <- 1.2}

# Standard deviation of the log-normal distribution of the within-population genetic variation
sigma_K <- 0.1

# Variance partitioning
pi1 <- 0.5 # prop of variance explained by residuals
pi2 <- 0.05 # prop of variance explained by block effects
pi3 <- 0.15 # prop of variance explained by the provenance
pi4 <- 0.3 # prop of variance explained by the clone effects


# MCMC parameters
#################

# Options for model sampling
n_chains <- 4
n_iter <- 2500
n_warm <- 1250
n_thin <- 1
save_warmup <- FALSE

# Options for the MCMC intervals
point.est <- "median"
prob.in <- 0.9
prob.out <- 0.95
conf.level <- 0.95
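With these settings, each model fit is based on 4 chains of 2,500 iterations (1,250 of which are warmup), i.e. 4 × 1,250 = 5,000 post-warmup draws per model (no thinning).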

2.1.1 Functions

2.1.1.1 SimulData()

#### Function to simulate the data ####
SimulData <- function(trait,nb.set.seed){
set.seed(nb.set.seed)

# As we want to base our simulations on the real experimental design, 
  # we load the trait data and extract the number of blocks, populations,
  # clones per population and trees per clone: 
df <- readRDS(file="data/ClonapinData/PhenoDataNovember2019_AnnualTraits.rds") %>% 
  dplyr::filter(!(prov=="MAD")) %>% 
  dplyr::rename(trait=all_of(trait)) %>% 
  dplyr::select(prov,clon,tree,block,trait) %>% 
  drop_na() %>% 
  arrange(tree)

# We keep the same trait mean as the real data:
beta0 <- mean(df$trait)

# Same for the standard deviation and variance of the trait:
sigma_tot <- sqrt(var(df$trait))
sigma2_tot <- var(df$trait)


# If we don't want to base our simulations on the real experimental design, 
  # we run these lines instead: we create a perfectly balanced dataset of 33 populations, 
  # 20 clones per population and one tree per clone in each of the 8 blocks
if(nb.clones==20){
  
vec.prov <- vector()
for(i in unique(df$prov)){vec.prov <- c(vec.prov,rep(i,nb.clones*8))}

vec.blocks <- vector()
for(i in 1:8){vec.blocks <- c(vec.blocks,rep(i,nb.clones))}

df <- tibble(prov=vec.prov,
             clon=paste0(prov,rep(1:nb.clones,264)),
             block=rep(vec.blocks,33),
             tree=paste0(clon,"_",block))
}


# We want: Var(Pop) + Var(Block) + Var(Clone) + Var(Residual) = Var(total)
sigma_prov <-  sqrt(sigma2_tot*pi3) # Among populations variation explains 15% of the total variance
sigma_block <-  sqrt(sigma2_tot*pi2) # Among blocks variation explains 5% of the total variance
sigma_r <- sqrt(sigma2_tot*pi1) # Residuals explain 50% of the total variance
mean_sigma_clon <- sqrt(sigma2_tot*pi4) # Among clones variation explains 30% of the total variance 

if(which.model=="model1"){  # Model 1
# In the first model, the effect of the potential drivers is 
  # on the median of the within-population genetic variation
 
Xp <- rnorm(length(unique(df$prov)),mean=0,sd=1) # Potential driver that is mean-centered
mu_sigma_clon <- log(mean_sigma_clon) - (sigma_K*sigma_K)/2  + betaX*Xp
log_sigma_clon <-  rnorm(length(unique(df$prov)),mu_sigma_clon, sigma_K )
sigma_clon <- exp(log_sigma_clon)

} else if(which.model=="model2"){ # Model 2
# In the second model, the effect of the potential drivers is
  # on the mean of the within-population genetic variation

Xp <- rnorm(length(unique(df$prov)),mean=0,sd=1)
mu_sigma_clon <- log(mean_sigma_clon + betaX*Xp) - (sigma_K*sigma_K)/2 
log_sigma_clon <-  rnorm(length(unique(df$prov)),mu_sigma_clon, sigma_K)
sigma_clon <- exp(log_sigma_clon)
}

df <- df %>% 
  inner_join(tibble(prov=unique(df$prov), # Population id
             sigma_clon= sigma_clon, # within-population genetic variation (among-clones variation)
             Xp=Xp, # Potential driver of the within-population genetic variation
             alpha_prov=rnorm(length(unique(df$prov)),mean=0,sd=sigma_prov)), # population varying intercepts
             by="prov") 

sub <- df %>% 
  dplyr::select(clon,alpha_prov,sigma_clon) %>% 
  distinct()

df <- df %>% 
  inner_join(tibble(clon=sub$clon, # clone id
             alpha_clon=rnorm(length(sub$clon),mean=0,sd=sub$sigma_clon)), # clone varying intercepts
             by="clon") %>% 
  inner_join(tibble(block=unique(df$block), # block id
             alpha_block=rnorm(length(unique(df$block)),mean=0,sd=sigma_block)), # block varying intercepts
             by="block") %>% 
  dplyr::mutate(mu=beta0 + alpha_prov + alpha_block + alpha_clon, # linear predictor
                ysim=rnorm(n(),mu,sigma_r)) # simulated trait values (one per tree)

list.sim <- list(df=df,
             beta0 =beta0,
             betaX=betaX,
             sigma_tot=sigma_tot,
             sigma_r=sigma_r,
             sigma_K =sigma_K,
             sigma_prov=sigma_prov,
             mean_sigma_clon =mean_sigma_clon ,
             mu_sigma_clon=mu_sigma_clon,
             Xp=Xp,
             sigma_block=sigma_block,
             pi2=(sigma_block*sigma_block)/(sigma_tot*sigma_tot),
             pi3=(sigma_prov*sigma_prov)/(sigma_tot*sigma_tot),
             pi4=mean(sigma_clon)*mean(sigma_clon)/(sigma_tot*sigma_tot),
             pi1=(sigma_r*sigma_r)/(sigma_tot*sigma_tot))
return(list.sim)
}
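As a quick sanity check (a sketch, not part of the original pipeline; it assumes the phenotypic data file loaded above is available and the parameters of section 2.1 are defined), the function can be called directly and the realised variance partition compared with the target proportions:

# Hypothetical usage example of SimulData()
list.sim.check <- SimulData(trait = "AST_htdec11", nb.set.seed = 44)
nrow(list.sim.check$df)                                      # number of simulated trees
round(unlist(list.sim.check[c("pi1","pi2","pi3","pi4")]), 2) # realised shares vs. targets 0.5 / 0.05 / 0.15 / 0.3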

2.1.1.2 PlotParam()
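Note that PlotParam() relies on the global objects params, param.labels and legend.position, which are redefined before each call in section 3.2 below.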

PlotParam <- function(x)  {
  
  model.draws <- readRDS(file=paste0("outputs/simulations/OneSimulation/",x,".rds")) %>% 
    as.matrix() %>% 
    as_tibble() %>% 
    dplyr::select(all_of(params))
  
  list.sim <- readRDS(file=paste0("outputs/simulations/OneSimulation/",x,"_ListSim.rds"))
  if(grepl("pi",params[[1]])==TRUE){sim.true.values <- list.sim[c("pi1","pi2","pi3","pi4")] %>% unlist()} else 
    {sim.true.values <- list.sim[params] %>% unlist()}
  
  mcmc_recover_intervals(model.draws,
                            sim.true.values,
                            prob = prob.in,
                            prob_outer = prob.out,
                            point_est=point.est) + 
  theme(axis.text = element_text(size=16))+
  labs(subtitle="")+
  scale_x_discrete(labels=param.labels) +
       theme(legend.position = legend.position,
             legend.background = element_rect(colour = "grey"),
             legend.title = element_blank(),
             axis.text.x=element_text(size=18))
}

3 One simulation

3.1 Data simulation + running the model

# set.seed of the simulations
nb.set.seed <- 44

for(trait in traits){
# Simulate the data
list.sim <- SimulData(trait,nb.set.seed)

list.stan <- list(N=length(list.sim$df$tree),
             y=list.sim$df$ysim,
             X=list.sim$Xp,
             nprov = length(unique(list.sim$df$prov)),
             nclon = length(unique(list.sim$df$clon)),
             nblock = length(unique(list.sim$df$block)),
             prov = as.numeric(as.factor(list.sim$df$prov)),
             which_prov = as.numeric(as.factor(pull(unique(list.sim$df[c("prov","clon")])[,"prov"]))), # population of each clone (for the clone-level varying SDs)
             clon = as.numeric(as.factor(list.sim$df$clon)),
             bloc = as.numeric(as.factor(list.sim$df$block)),
             sigma_K = sigma_K)

if(which.model=="model1") {
  model = stan_model("scripts/StanModels/HierarchicalModel_VaryingInterClonesSD_OneCovariate.stan")
} else if(which.model=="model2"){
  model = stan_model("scripts/StanModels/HierarchicalModel_VaryingInterClonesSD_OneCovariate_NotWorking.stan")}

fit.model <- sampling(model, 
                      data = list.stan, 
                      pars=c("beta0","betaX",
                             "pi",
                             "sigma_tot","sigma_block","sigma_prov","mean_sigma_clon","sigma_clon","sigma_K","sigma_r"),
                      iter = n_iter, 
                      chains = n_chains, 
                      cores = n_chains,
                      save_warmup= save_warmup,
                      thin=n_thin) 

saveRDS(fit.model,file=paste0("outputs/simulations/OneSimulation/",trait,".rds"))
saveRDS(list.sim,file=paste0("outputs/simulations/OneSimulation/",trait,"_ListSim.rds"))
}
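Before drawing the figures, a quick convergence check can be run on the saved fits (a minimal sketch, not part of the original pipeline, reading the files saved just above):

# Largest split-Rhat across all monitored parameters, for each trait
# (values close to 1 indicate convergence)
sapply(traits, function(x){
  fit <- readRDS(paste0("outputs/simulations/OneSimulation/", x, ".rds"))
  max(summary(fit)$summary[, "Rhat"], na.rm = TRUE)
})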

3.2 SI figures

3.2.1 Standard Deviations

params  <- c("sigma_tot","sigma_r","sigma_block","sigma_prov","mean_sigma_clon")
param.labels <- c(parse(text = TeX("$mean(\\sigma_{C_{p}})$")),
                  parse(text = TeX("$\\sigma_{B}$")),
                  parse(text = TeX("$\\sigma_{P}$")),
                  parse(text = TeX("$\\sigma_{r}$")),
                  parse(text = TeX("$\\sigma_{tot}$")))
legend.position <- c(0.2,0.9)



plots <- mclapply(traits,PlotParam)
plots[[2]] <-  plots[[2]] + theme(legend.position = "none")
plots[[3]] <-  plots[[3]] + theme(legend.position = "none")


p <- plot_grid(plots[[1]],plots[[2]],plots[[3]],
               labels = c('Asturias (21 months)','Fundão (20 months)',"Bordeaux (25 months)"),
               label_size = 20,
               nrow=1)
ggsave(p,file=paste0("figs/Simulations/OneSimulation_StandardDeviations.png"),height=8,width=17)
p

3.2.2 Variance partitioning

params  <- c("pi[1]","pi[2]","pi[3]","pi[4]")
param.labels <- c(parse(text = TeX("$\\pi_{r}$")),
                  parse(text = TeX("$\\pi_{B}$")),
                  parse(text = TeX("$\\pi_{P}$")),
                  parse(text = TeX("$\\pi_{C}$")))
legend.position <- c(0.8,0.9)


plots <- mclapply(traits,PlotParam)
plots[[2]] <-  plots[[2]] + theme(legend.position = "none")
plots[[3]] <-  plots[[3]] + theme(legend.position = "none")


p <- plot_grid(plots[[1]],plots[[2]],plots[[3]],
               labels = c('Asturias (21 months)','Fundão (20 months)',"Bordeaux (25 months)"),
               label_size = 20,
               nrow=1)
ggsave(p,file=paste0("figs/Simulations/OneSimulation_VariancePartitioning.png"),height=8,width=17)
p

3.2.3 sigma_K and beta_X

params  <- c("sigma_K","betaX")
param.labels <- c(parse(text = TeX("$\\beta_{X}$")),
                  parse(text = TeX("$\\sigma_{K}$")))
legend.position <- c(0.3,0.85)


plots <- mclapply(traits,PlotParam)
plots[[2]] <-  plots[[2]] + theme(legend.position = "none")
plots[[3]] <-  plots[[3]] + theme(legend.position = "none")


p <- plot_grid(plots[[1]],plots[[2]],plots[[3]],
               labels = c('Asturias (21 months)','Fundão (20 months)',"Bordeaux (25 months)"),
               label_size = 20,
               nrow=1)
ggsave(p,file=paste0("figs/Simulations/OneSimulation_SigmaKBetaX.png"),height=6,width=17)
p

3.2.4 Correlation between sigma_K and beta_X

plots <- mclapply(traits,function(x){
  
fit.model <- readRDS(file=paste0("outputs/simulations/OneSimulation/",x,".rds"))
np <- nuts_params(fit.model)
p <- mcmc_pairs(as.array(fit.model), 
           np = np,
           pars = c("sigma_K","betaX"), 
           off_diag_args = list(size = 1, alpha = 1/3),
           np_style = pairs_style_np(div_size=3, div_shape = 19),
           max_treedepth =10)
  
})

p <- plot_grid(plots[[1]],plots[[2]],plots[[3]],
               labels = c('A)','B)',"C)"),
               label_size = 20,vjust=1.8,
               nrow=1)
ggsave(p,file=paste0("figs/Simulations/OneSimulation_CorrelationsSigmaKBetaX.png"),height=6,width=17)
p

3.2.5 Old figures

4 100 Simulations

4.1 Running 100 models

set.seed(23)

nb.set.seed <- sample(1:1000,100,replace=FALSE)
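# NB (assumption, based on section 4.2 which reads one output file per trait): this chunk is run
# once per trait, with `trait` set to the focal trait beforehand (e.g. trait <- traits[[1]]).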

sim <- lapply(nb.set.seed,function(x){

  list.sim <- SimulData(trait=trait,nb.set.seed=x)
  
  list.stan <- list(N=length(list.sim$df$tree),
             y=list.sim$df$ysim,
             X=list.sim$Xp,
             nprov = length(unique(list.sim$df$prov)),
             nclon = length(unique(list.sim$df$clon)),
             nblock = length(unique(list.sim$df$block)),
             prov = as.numeric(as.factor(list.sim$df$prov)),
             which_prov = as.numeric(as.factor(pull(unique(list.sim$df[c("prov","clon")])[,"prov"]))),
             clon = as.numeric(as.factor(list.sim$df$clon)),
             bloc = as.numeric(as.factor(list.sim$df$block)),
             sigma_K = sigma_K)
  
  if(which.model=="model1") {
  model = stan_model("scripts/StanModels/HierarchicalModel_VaryingInterClonesSD_OneCovariate.stan")
} else if(which.model=="model2"){
  model = stan_model("scripts/StanModels/HierarchicalModel_VaryingInterClonesSD_OneCovariate_NotWorking.stan")}
  
  fit.model <- sampling(model, 
                        data = list.stan, 
                        pars=c("betaX","sigma_K"), 
                        iter = n_iter, 
                        chains = n_chains, 
                        cores = n_chains,
                        save_warmup = save_warmup,
                        thin=n_thin) 
  
   conf95 <- broom::tidyMCMC(fit.model,pars=(c("betaX","sigma_K")),
                droppars = NULL, estimate.method = "median", ess = T, rhat = T, conf.int = T,conf.level = 0.95) %>% 
     dplyr::rename(conf.low.95=conf.low,
                   conf.high.95=conf.high,
                   median=estimate)
   
   broom::tidyMCMC(fit.model,pars=(c("betaX","sigma_K")),
                droppars = NULL, estimate.method = "mean", ess = F, rhat = F, conf.int = T,conf.level = 0.80)%>% 
     dplyr::rename(conf.low.80=conf.low,
                   conf.high.80=conf.high,
                   mean=estimate) %>% 
     inner_join(conf95,by=c("term","std.error")) %>% 
     dplyr::select(term,mean,median,std.error,conf.low.80,conf.high.80,conf.low.95,conf.high.95, rhat ,  ess)
  
})

saveRDS(sim,file=paste0("outputs/simulations/100sim_",which.model,"_",trait,".rds"))

4.2 SI table

tab <-   lapply(traits, function(trait){

# betaX
sim <- readRDS(file=paste0("outputs/simulations/100sim_model1_",trait,".rds")) %>% 
  bind_rows() %>% 
  filter(term=="betaX") %>% 
  mutate(bias.mean=mean-betaX,
         bias.median=median-betaX,
         coverage.conf.80=case_when(
           betaX > conf.low.80  & betaX < conf.high.80 ~ 1,
           betaX < conf.low.80 ~ 0,
           betaX > conf.high.80 ~ 0),
         coverage.conf.95=case_when(
           betaX > conf.low.95  & betaX < conf.high.95 ~ 1,
           betaX < conf.low.95 ~ 0,
           betaX > conf.high.95 ~ 0))

tib <- tibble(term="betaX",
       true.value=betaX,
       mean.std.error=mean(sim$std.error),
       mean.bias.mean=mean(sim$bias.mean),
       mean.bias.median=mean(sim$bias.median),
       coverage.conf.80=sum(sim$coverage.conf.80),
       coverage.conf.95=sum(sim$coverage.conf.95))

# sigma_k
sim <- readRDS(file=paste0("outputs/simulations/100sim_model1_",trait,".rds")) %>% 
  bind_rows() %>% 
  filter(term=="sigma_K") %>% 
  mutate(bias.mean=mean-sigma_K,
         bias.median=median-sigma_K,
         coverage.conf.80=case_when(
           sigma_K > conf.low.80  & sigma_K < conf.high.80 ~ 1,
           sigma_K < conf.low.80 ~ 0,
           sigma_K > conf.high.80 ~ 0),
         coverage.conf.95=case_when(
           sigma_K > conf.low.95  & sigma_K < conf.high.95 ~ 1,
           sigma_K < conf.low.95 ~ 0,
           sigma_K > conf.high.95 ~ 0))

tib <- tib %>% 
  bind_rows(tibble(term="sigma_K",
       true.value=sigma_K,
       mean.std.error=mean(sim$std.error),
       mean.bias.mean=mean(sim$bias.mean),
       mean.bias.median=mean(sim$bias.median),
       coverage.conf.80=sum(sim$coverage.conf.80),
       coverage.conf.95=sum(sim$coverage.conf.95))) %>% 
  mutate(Traits=trait)
}) %>% 
  bind_rows()

tab %>% 
  kable(digits=3) %>% 
  kable_styling(latex_options =c("striped", "hold_position"))
term     true.value  mean.std.error  mean.bias.mean  mean.bias.median  coverage.conf.80  coverage.conf.95  Traits
betaX    0.1         0.049           -0.004          -0.004            81                96                AST_htdec11
sigma_K  0.1         0.060            0.006           0.001            93                99                AST_htdec11
betaX    0.1         0.053           -0.003          -0.003            82                99                POR_htoct12
sigma_K  0.1         0.065            0.009           0.002            98                100               POR_htoct12
betaX    0.1         0.054            0.006           0.006            76                96                BDX_htnov13
sigma_K  0.1         0.065            0.013           0.006            95                99                BDX_htnov13
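In this table, mean.bias.mean and mean.bias.median are the average differences (over the 100 simulations) between the posterior mean or median and the true parameter value, and coverage.conf.80 and coverage.conf.95 give the number of simulations (out of 100) in which the 80% and 95% credible intervals contained the true value; well-calibrated intervals should give values close to 80 and 95, respectively.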
# Save as a latex table for the Supplementary Information
print(xtable(tab, type = "latex",digits=3), 
      file = paste0("tables/100Simulations_Outputs.tex"), 
      include.rownames=FALSE)

The second model (in which the effect of the potential drivers applies to the mean of the within-population genetic variation) yields wider credible intervals and thus less precise estimates. We therefore keep the first model (in which the effect applies to the median of the within-population genetic variation) for the subsequent analyses and only report its outputs in the paper.