Genomic offset predictions in Scots pine

With GF, RDA, pRDA, GDM and LFMM

Author

Juliette Archambeau

Published

April 23, 2025

1 Using all SNPs

Code

all_snp_names <- readRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_freq_withoutMAF.rds"))[,-1] %>% colnames(.)

snp_sets <- list(all_snps = list(set_code = "all_snps",
                                 set_name = paste0("All SNPs (", length(all_snp_names),")"),
                                 set_snps = all_snp_names))

2 Climatic variables

Code

clim_var_avg_selected <- readRDS(here("data/ScotsPine/subdf.rds")) %>% pull(clim_var_avg) %>% unique()

clim_var_names <- tibble(code = clim_var_avg_selected,
                         name = c("Summer near-surface relative humidity (%)",
                                  "Summer precipitation flux (kg m-2 s-1)",
                                  "Summer surface downwelling shortwave radiation (W m-2)",
                                  "Winter wind speed (m s-1)",
                                  "Annual near-surface air temperature (K)"))

3 `scale_clim` function

Code

# Function to format climate data (mean-center the climatic variables and get datasets with climatic variables in columns)
scale_clim <- function(clim_data, em, rcp, ref_timeslice, pred_timeslice, clim_var_avg_selected, scale_only_ref = F){
  
  # Dataset with the climatic variables of the reference period (not scaled)
  
  ref_climdf <- clim_data %>% 
    filter(ensemble_member == em & 
           rcp_scenario == rcp & 
           timeslice == ref_timeslice & 
           clim_var_avg %in% clim_var_avg_selected) %>% 
    dplyr::select(-clim_var, -time_averages) %>% 
    pivot_wider(names_from = clim_var_avg, values_from = value)
  
  # Mean-center the climatic variables across the reference period
  ref_climdf_scaled <- ref_climdf %>%
    dplyr::mutate(across(any_of(clim_var_avg_selected), ~ (. - mean(.)) / sd(.)))
  
  if(scale_only_ref == T){
    ref_climdf_scaled
  } else {
  # Extract scaling parameters, i.e. mean and variance
  scale_params <- lapply(clim_var_avg_selected, function(x){
    
    vec_var <- ref_climdf[,x] %>% pull()
    
    list(mean = mean(vec_var),
         sd = sd(vec_var))
    
  }) %>% setNames(clim_var_avg_selected)
  
  
  
  
  # Dataset with the climatic variables across the future period (not scaled)
  pred_climdf <- clim_data %>% 
    filter(ensemble_member == em & 
           rcp_scenario == rcp & 
           timeslice == pred_timeslice & 
           clim_var_avg %in% clim_var_avg_selected) %>% 
    dplyr::select(-clim_var, -time_averages) %>% 
    pivot_wider(names_from = clim_var_avg, values_from = value)
  
  # Mean-center the climatic variables across the future period
  pred_climdf_scaled <- pred_climdf
  for(i in clim_var_avg_selected){
    pred_climdf_scaled[,i] <- (pred_climdf_scaled[,i] - scale_params[[i]]$mean) / scale_params[[i]]$sd
  }
  
  list(ref_climdf = ref_climdf, 
       pred_climdf = pred_climdf, 
       ref_climdf_scaled = ref_climdf_scaled, 
       pred_climdf_scaled = pred_climdf_scaled, 
       scale_params = scale_params)}
}

4 Gradient Forest (GF)

Code

# Predictor cumulative plots
make_predictor_cumulative_plot <- function(x){
  
  plot(x, plot.type="C",
       imp.vars=names(importance(x)),
       show.species=F,common.scale=T,
       cex.axis=1, 
       cex.lab=1.5,
       line.ylab=1,
       par.args=list(mgp=c(1.5, 0.5,0),mar=c(2.5,1,0.1,0.5),omi=c(0, +0.3,0,0)))}

# R2 measure of the fit of the random forest model for each species
make_performance_plot <- function(x, horizontal=FALSE){

    old.mar<-par()$mar
    par(mfrow=c(1,1),mar=old.mar+c(0,0,0,0))
    Ylab <- expression(R^2)

    perf <- importance(x, type="Species")
    n <- length(perf)
    if (horizontal)
      plot(perf, 1:n, las = 2, pch=19, axes=F, xlab="", ylab="")
    else
      plot(1:n, perf, las = 2, pch=19, axes=F, xlab="", ylab="")
    axis(labels= names(perf),side=1+horizontal,at=1:n, cex.axis=0.7, padj=0,las=2)
    axis(side=2-horizontal, cex.axis=1)
    mtext(Ylab,side=2-horizontal,line=2)
    title("Overall performance of random forests over loci")
    abline(h = 0, lty = 2)
    box()
    par(mar=old.mar)

}


# Species (alleles) cumulative plot
make_species_cumulative_plot <- function(x){
  
  plot(x,
     plot.type="C",imp.vars=names(importance(x)), 
     show.overall=F,legend=T,leg.posn="topleft", 
     leg.nspecies=10,
     cex.lab=1,
     cex.legend=1, 
     cex.axis=1,
     line.ylab=1,
     par.args=list(mgp=c(1.5, 0.5,0),mar=c(2.5,1,0.1,0.5),omi=c(0, +0.3,0,0)))
  }

Code

# Function to estimate the genomic offset with Gradient Forest
RCPs <- c("RCP2.6","RCP8.5")
EMs <- c("01","04","06","15")
ref_timeslice <- "1980-2000"
pred_timeslices <- c("2030-2050","2060-2080")
snp_sets <- snp_sets
clim_data <- readRDS(here("data/ScotsPine/subdf.rds")) 
geno_data <- readRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_freq_withoutMAF.rds"))
clim_var_avg_selected <- unique(clim_data$clim_var_avg)

# # for the example
snp_set_i <- snp_sets[[1]]
rcp_i <- "RCP8.5"
em_i <- "15"
pred_timeslice_i <- "2030-2050"


df_GF_gen_off <- lapply(snp_sets, function(snp_set_i){
  
  # Subset genomic data for the set of selected SNPs
  sub_geno_data <- geno_data  %>% 
    dplyr::select(snp_set_i$set_snps) %>% 
    set_colnames(paste0("snp_",1:length(snp_set_i$set_snps))) %>% 
    as.data.frame() 
    
    lapply(EMs, function(em_i){
      
      # Mean - center the reference climatic data
      ref_clim_data <- scale_clim(clim_data = clim_data,
                                  em = em_i,
                                  rcp = RCPs[[1]],
                                  ref_timeslice = ref_timeslice,
                                  pred_timeslice = pred_timeslices[[1]],
                                  clim_var_avg_selected = clim_var_avg_selected,
                                  scale_only_ref = T)
      
      # dataframe formatted for GF
      # gf_data <- ref_clim_data %>% 
      #   dplyr::select(any_of(clim_var_avg_selected)) %>% 
      #   data.frame(sub_geno_data,.,check.names=F)
      # 
      # set.seed(3) # for reproducibility, as the results of GF models vary from one run to another
      # 
      # gf_mod <- gradientForest(gf_data,
      #                          predictor.vars=clim_var_avg_selected,
      #                          response.vars=colnames(sub_geno_data),
      #                          corr.threshold=0.5,
      #                          ntree=500,
      #                          trace=T)
      # 
      # saveRDS(gf_mod, file = here(paste0("outputs/ScotsPine/GF/models/",
      #                                    snp_set_i$set_code,"_em",
      #                                    em_i,"_refperiod_",
      #                                    gsub("\\-", "_", ref_timeslice),
      #                                    ".rds")))
      
      gf_mod <- readRDS(file=here(paste0("outputs/ScotsPine/GF/models/",
                                         snp_set_i$set_code,"_em",
                                         em_i,"_refperiod_",
                                         gsub("\\-", "_", ref_timeslice),
                                         ".rds")))
      
      #######
      # Plots
      #######
      
      # Overall importance plot
      # pdf(here(paste0("figs/ScotsPine/GF/overall_importance_plots/",
      #                 snp_set_i$set_code,"_em",
      #                 em_i,"_refperiod_",
      #                 gsub("\\-", "_", ref_timeslice),
      #                 ".pdf")), width=8,height=5)
      # plot(gf_mod, plot.type="Overall.Importance")
      # dev.off()
      
      # Species (allele) cumulative plot - do not plot if too many alleles
      # pdf(here(paste0("figs/ScotsPine/GF/allele_cumulative_plots/",
      #                 snp_set_i$set_code,"_em",
      #                 em_i,"_",
      #                 gsub("\\-", "_", ref_timeslice),
      #                 ".pdf")),width=7,height=7)
      # make_species_cumulative_plot(x=gf_mod)
      # dev.off()
      
      # Predictor cumulative plot
      # pdf(here(paste0("figs/ScotsPine/GF/predictor_cumulative_plots/",
      #                 snp_set_i$set_code,"_em",
      #                 em_i,"_",
      #                 gsub("\\-", "_", ref_timeslice),
      #                 ".pdf")), width=7,height=7)
      # make_predictor_cumulative_plot(x=gf_mod)
      # dev.off()
      
      # R2 measure of the fit of the random forest model for each species - do not run if many alleles
      # p <- tibble(snp=names(sort(gf_mod$result)),importance=sort(gf_mod$result)) %>% 
      #   ggplot() +
      #   geom_point(aes(y=reorder(snp, importance),x=importance)) +
      #   ylab("") +
      #   xlab(expression(R^2)) +
      #   theme_bw() 
      # 
      # ggsave(p, filename = here(paste0("figs/ScotsPine/GF/allele_importance_plots/",
      #                                  snp_set_i$set_code,"_em",
      #                                  em_i,"_",
      #                                  gsub("\\-", "_", ref_timeslice),
      #                                  ".pdf")), width=8, height=10)
      
      ####
      
      
      lapply(RCPs, function(rcp_i){
        
        lapply(pred_timeslices, function(pred_timeslice_i){
          
          # Mean - center the current and future climatic data
          clim_data_scaled <- scale_clim(clim_data = clim_data,
                                         em = em_i,
                                         rcp = rcp_i,  
                                         ref_timeslice = ref_timeslice,
                                         pred_timeslice = pred_timeslice_i,
                                         clim_var_avg_selected = clim_var_avg_selected)
          
          # Genomic offset predictions
          ref_pred <- predict(gf_mod) # predictions under current climates
          fut_pred <- predict(gf_mod,  
                              clim_data_scaled$pred_climdf_scaled %>% 
                                dplyr::select(any_of(clim_var_avg_selected)) %>% 
                                as.data.frame()) # predictions under future climates
          
          go <- lapply(1:nrow(ref_pred), function(x, ref_pred, fut_pred){
            as.numeric(pdist(ref_pred[x,],  
                             fut_pred[x,])@dist)},
            fut_pred=fut_pred, ref_pred=ref_pred) %>% 
            unlist()
          
          clim_data_scaled$pred_climdf_scaled %>% 
            dplyr::select(-any_of(clim_var_avg_selected)) %>% 
            dplyr::rename(pred_timeslice = timeslice) %>% 
            dplyr::mutate(var_code = snp_set_i$set_code,
                          var_name = snp_set_i$set_name,
                          method = "GF",
                          val = go)
          
          }) %>% bind_rows()
        }) %>% bind_rows()
      }) %>% bind_rows()
    }) %>% bind_rows()

saveRDS(df_GF_gen_off, here(paste0("outputs/ScotsPine/GF/df_gen_off_pred.rds")))

5 Redundancy Analysis (RDA) and partial RDA

Code

# Function to estimate the genomic offset with RDA
RCPs <- c("RCP2.6","RCP8.5")
EMs <- c("01","04","06","15")
ref_timeslice <- "1980-2000"
pred_timeslices <- c("2030-2050","2060-2080")
snp_sets <- snp_sets
clim_data <- readRDS(here("data/ScotsPine/subdf.rds")) 
geno_data <- readRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_freq_withoutMAF.rds"))
clim_var_avg_selected <- unique(clim_data$clim_var_avg)
PCs <- readRDS(file = here("outputs/ScotsPine/PCA/PCs.rds")) %>% dplyr::select(PopulationCode, PC1, PC2)

df_RDA_gen_off <- lapply(snp_sets, function(snp_set_i){
  
  # Subset genomic data for the set of selected SNPs
  sub_geno_data <- geno_data  %>% 
    dplyr::select(any_of(snp_set_i$set_snps))
    
    lapply(EMs, function(em_i){
      
      # Mean - center the reference climatic data
      ref_clim_data <- scale_clim(clim_data = clim_data,
                                  em = em_i,
                                  rcp = RCPs[[1]],
                                  ref_timeslice = ref_timeslice,
                                  pred_timeslice = pred_timeslices[[1]],
                                  clim_var_avg_selected = clim_var_avg_selected,
                                  scale_only_ref = T)
      
      # RDA models not corrected for population structure (RDA)  
      form_rda_raw <- paste("sub_geno_data ~ ", paste(clim_var_avg_selected,collapse= " + "))
      rda_raw <- rda(as.formula(form_rda_raw), ref_clim_data[,clim_var_avg_selected])

      saveRDS(rda_raw, file = here(paste0("outputs/ScotsPine/RDA/models/rda_raw_",
                                          snp_set_i$set_code,"_em",
                                          em_i,"_refperiod_",
                                          gsub("\\-", "_", ref_timeslice),
                                          ".rds")))
      
      r2 <- RsquareAdj(rda_raw)
      eigenvalues <- summary(eigenvals(rda_raw, model = "constrained"))
      model_significance <- anova.cca(rda_raw, parallel=getOption("mc.cores")) # default is permutation=999
      axis_significance <- anova.cca(rda_raw, by="axis", parallel=getOption("mc.cores"))
      vif <- vif.cca(rda_raw)
      
      nb_signi_axis <- axis_significance %>% 
        as.data.frame() %>% 
        dplyr::rename(pvalue="Pr(>F)") %>% 
        dplyr::filter(pvalue<0.05) %>% 
        nrow()
      
      rda_models <- list(rda_raw = list(name = "RDA",
                                        form = form_rda_raw,
                                        mod = rda_raw,
                                        r2 = r2,
                                        eigenvalues = eigenvalues,
                                        model_significance = model_significance,
                                        axis_significance = axis_significance,
                                        vif = vif,
                                        nb_signi_axis = nb_signi_axis)) 
          
      
      # RDA models corrected for population structure (pRDA)
      form_rda_cor <- paste("sub_geno_data ~ ", paste(clim_var_avg_selected,collapse= " + "), " + Condition (PC1 + PC2)") 
      ref_clim_data_pcs <- ref_clim_data %>%
        left_join(PCs, by="PopulationCode") %>%
        .[,c(clim_var_avg_selected,colnames(PCs[,-1]))]
      rda_cor <- rda(as.formula(form_rda_cor), ref_clim_data_pcs)

      saveRDS(rda_cor, file = here(paste0("outputs/ScotsPine/RDA/models/rda_cor_",
                                          snp_set_i$set_code,"_em",
                                          em_i,"_refperiod_",
                                          gsub("\\-", "_", ref_timeslice),
                                          ".rds")))
      
      r2 <- RsquareAdj(rda_cor)
      eigenvalues <- summary(eigenvals(rda_cor, model = "constrained"))
      model_significance <- anova.cca(rda_cor, parallel=getOption("mc.cores")) # default is permutation=999
      axis_significance <- anova.cca(rda_cor, by="axis", parallel=getOption("mc.cores"))
      vif <- vif.cca(rda_cor)
      
      nb_signi_axis <- axis_significance %>% 
        as.data.frame() %>% 
        dplyr::rename(pvalue="Pr(>F)") %>% 
        dplyr::filter(pvalue<0.05) %>% 
        nrow()
      
      rda_models$rda_cor <-list(name = "pRDA",
                                form = form_rda_cor,
                                mod = rda_cor,
                                r2 = r2,
                                eigenvalues = eigenvalues,
                                model_significance = model_significance,
                                axis_significance = axis_significance,
                                vif = vif,
                                nb_signi_axis = nb_signi_axis)
      
       saveRDS(rda_models, file = here(paste0("outputs/ScotsPine/RDA/models/list_rda_models_",
                                              snp_set_i$set_code,"_em",
                                              em_i,"_refperiod_",
                                              gsub("\\-", "_", ref_timeslice),
                                              ".rds")))

       rda_models <- readRDS(file = here(paste0("outputs/ScotsPine/RDA/models/list_rda_models_",
                                              snp_set_i$set_code,"_em",
                                              em_i,"_refperiod_",
                                              gsub("\\-", "_", ref_timeslice),
                                              ".rds")))
       
      lapply(RCPs, function(rcp_i){
        
        lapply(pred_timeslices, function(pred_timeslice_i){
          
          # Mean - center the current and future climatic data
          clim_data_scaled <- scale_clim(clim_data = clim_data,
                                         em = em_i,
                                         rcp = rcp_i,  
                                         ref_timeslice = ref_timeslice,
                                         pred_timeslice = pred_timeslice_i,
                                         clim_var_avg_selected = clim_var_avg_selected)
          
          lapply(rda_models, function(rda_mod_i){
            
            if(rda_mod_i$nb_signi_axis > 0){ 
              
            # weights based on the variance explained by the different axes
            weights <- (rda_mod_i$mod$CCA$eig/sum(rda_mod_i$mod$CCA$eig))[0:rda_mod_i$nb_signi_axis]
            
            # We use the method "predict"
            if(rda_mod_i$name == "RDA"){
              
              clim_ref <- clim_data_scaled$ref_climdf_scaled[,clim_var_avg_selected]
              clim_pred <- clim_data_scaled$pred_climdf_scaled[,clim_var_avg_selected]
              
            } else if(rda_mod_i$name == "pRDA"){
              
              clim_ref <- clim_data_scaled$ref_climdf_scaled %>% 
                inner_join(PCs, by="PopulationCode") %>% 
                dplyr::select(any_of(clim_var_avg_selected), any_of(colnames(PCs[,-1])))
              clim_pred <- clim_data_scaled$pred_climdf_scaled %>% 
                inner_join(PCs, by="PopulationCode") %>% 
                dplyr::select(any_of(clim_var_avg_selected), any_of(colnames(PCs[,-1])))
            }
            
            pred_ref <- predict(rda_mod_i$mod, clim_ref, type = "lc")[,0:rda_mod_i$nb_signi_axis]
            pred_fut <- predict(rda_mod_i$mod, clim_pred, type = "lc")[,0:rda_mod_i$nb_signi_axis]
            
            
            if(!is.null(weights)){
              if(rda_mod_i$nb_signi_axis > 1){
              for(i in 1:rda_mod_i$nb_signi_axis){
                pred_ref[,i] <- pred_ref[,i] * weights[i]
                pred_fut[,i] <- pred_fut[,i] * weights[i]
                
                list_AI <- list(pred_ref,pred_fut)
                
                gen_off <- unlist(lapply(1:nrow(list_AI[[1]]), function(x) dist(rbind(list_AI[[1]][x,], list_AI[[2]][x,]), method = "euclidean")))
                
              }} else {
                pred_ref <- pred_ref * weights
                pred_fut <- pred_fut * weights
                
                list_AI <- list(pred_ref,pred_fut)
                
                gen_off <- unlist(lapply(1:length(list_AI[[1]]), function(x) dist(rbind(list_AI[[1]][[x]], list_AI[[2]][[x]]), method = "euclidean")))
              }}
            
            clim_data_scaled$pred_climdf_scaled %>% 
            dplyr::select(-any_of(clim_var_avg_selected)) %>% 
            dplyr::rename(pred_timeslice = timeslice) %>% 
            dplyr::mutate(var_code = snp_set_i$set_code,
                          var_name = snp_set_i$set_name,
                          val = gen_off,
                          method = rda_mod_i$name)
            }
            }) %>% bind_rows()
          }) %>% bind_rows()
        }) %>% bind_rows()
      }) %>% bind_rows()
    }) %>% bind_rows()

saveRDS(df_RDA_gen_off, here(paste0("outputs/ScotsPine/RDA/df_gen_off_pred.rds")))

6 Generalized Dissimilarity Modeling (GDM)

Code

# This script can only be run on modest SNP sets. For running on all SNPs, see next chunk.

# Loading allele counts without MAF
geno_data <- readRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_counts_withoutMAF.rds"))

geno_data[geno_data ==1] <- 12
geno_data[geno_data ==2] <- 22
geno_data[geno_data ==0] <- 11


fst_matrices <- sapply(snp_sets, function(x){ 
  
  geno_data %>% 
    dplyr::select(PopulationCode,all_of(x$set_snps)) %>%
    as.data.frame() %>% 
    pairwise.WCfst(diploid=TRUE)
    
}, USE.NAMES = TRUE,simplify=FALSE)

# save the matrices
saveRDS(fst_matrices,file=here("outputs/ScotsPine/GDM/fst_matrices/fst_matrices.rds"))

Code

# Script to prepare the dataset to send to Santi, so that he computes the Fst matrix with all SNPs (for GDM)
geno_data <- readRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_counts_withoutMAF.rds"))

geno_data[geno_data ==1] <- 12
geno_data[geno_data ==2] <- 22
geno_data[geno_data ==0] <- 11

geno_data <- geno_data %>% 
  dplyr::select(-FieldCode,-Family) %>%
  as.data.frame() %>% 
  arrange(PopulationCode)

#saveRDS(geno_data,file= here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/GenMatrix_FormattedForHierfstatPackage.rds"))
# geno_data <- readRDS(file= "data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/GenMatrix_FormattedForHierfstatPackage.rds")

# The matrix was run on the genotool cluster by Santi as my laptop crashed when computing it
pairwise.WCfst(geno_data, diploid=TRUE)

# Here is the matrix computed by Santi:
pairwise_fst <- read.delim(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/pairwise_fst.txt"), row.names=1)
  
# We put the matrix in a list so that it can be used in the following scripts.
fst_matrices <- list(all_snps = pairwise_fst)

Code

#fst_matrices <- readRDS(file=here("outputs/ScotsPine/GDM/fst_matrices/fst_matrices.rds"))

# Is there some negative values?
neg_vals <- sapply(fst_matrices, function(x){ length(x[which(x <0)])}, USE.NAMES = TRUE,simplify=FALSE)

# Counting the number of negative values
lapply(names(snp_sets), function(x){
  
  tibble('SNP set' = snp_sets[[x]]$set_name,
         'Nb of negative values' = neg_vals[[x]])
  
}) %>% 
  bind_rows() %>% 
  kable_mydf()
# No negative values in the Fst matrix with all SNPs

#### Scaling the matrices ###

# Following Fitzpatrick & Keller (2015): subtracting the minimum value and then dividing by the maximum value
# fst_matrices <- sapply(fst_matrices, function(x){
#   
#   (x-min(x,na.rm=T))/max(x,na.rm=T)
#   
# }, USE.NAMES = TRUE,simplify=FALSE)



fst_matrices <- sapply(fst_matrices, function(x){

 (x-min(x,na.rm=T))/(max(x,na.rm=T)-min(x,na.rm=T))

}, USE.NAMES = TRUE,simplify=FALSE)

# to check that it worked
# sapply(fst_matrices, function(x){range(x,na.rm=T)}, USE.NAMES = TRUE,simplify=FALSE)

source(here("scripts/functions/corpmat.R"))
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
p.mat <- corpmat(fst_matrices$all_snps %>% as.matrix())
p <- corrplot::corrplot(as.matrix(fst_matrices$all_snps %>% as.matrix()), method="color", col=col(200),
                        type="upper", order="hclust", 
                        addCoef.col = "black", # Add coefficient of correlation
                        tl.col="black", tl.srt=23, #Text label color and rotation
                        # Combine with significance
                        p.mat = p.mat, sig.level = 0.05, insig = "blank", number.cex =0.8,tl.cex = 0.8,
                        # hide correlation coefficient on the principal diagonal
                        diag=FALSE,
                        title = "Scaled Fst matrix with all SNPs",mar=c(0,0,2,0)) # add an upper margin to see the title
 
p 

# For GDM, the distance matrix must have as the first column the names of the populations
fst_matrices <- sapply(fst_matrices, function(x){
  
x <- x %>% 
    as.data.frame() %>% 
    tibble::rownames_to_column("PopulationCode") 
  
}, USE.NAMES = TRUE,simplify=FALSE)

#saveRDS(fst_matrices,file=here("outputs/ScotsPine/GDM/fst_matrices/scaled_fst_matrices.rds"))

Code

#fst_matrices <- readRDS(file=here("outputs/ScotsPine/GDM/fst_matrices/scaled_fst_matrices.rds"))

# Function to estimate the genomic offset with GDM
RCPs <- c("RCP2.6","RCP8.5")
EMs <- c("01","04","06","15")
ref_timeslice <- "1980-2000"
pred_timeslices <- c("2030-2050","2060-2080")
snp_sets <- snp_sets
clim_data <- readRDS(here("data/ScotsPine/subdf.rds")) 
clim_var_avg_selected <- unique(clim_data$clim_var_avg)


# for the example
# snp_set_i <- snp_sets[[1]]
# rcp_i <- "RCP8.5"
# em_i <- "01"
# pred_timeslice_i <- "2030-2050"
rm(rcp_i);rm(em_i);rm(pred_timeslice_i);rm(snp_set_i)

df_GDM_gen_off <- lapply(snp_sets, function(snp_set_i){
    
    lapply(EMs, function(em_i){
      
      # Mean - center the reference climatic data
      ref_clim_data <- clim_data %>%
        filter(ensemble_member == em_i &
                 rcp_scenario == RCPs[[1]] &
                 timeslice == ref_timeslice &
                 clim_var_avg %in% clim_var_avg_selected) %>%
        dplyr::select(-clim_var, -time_averages) %>%
        pivot_wider(names_from = clim_var_avg, values_from = value)

      # Steps:
      # extract the geographic coordinates of the populations
      # transform the coordinates in spatial points and specify the CRS (WGS84) with sp package
      # transform to a SpatVector object for the terra package
      # reproject with terra package in CRS EPSG:3035
      # extract population coordinates from the SpatVector with terra package
      
      pop_coord_proj <- ref_clim_data %>%
        dplyr::select(Longitude,Latitude) %>%
        sp::SpatialPoints(proj4string = CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0")) %>%
        terra::vect() %>%
        terra::project("EPSG:3035") %>%
        terra::crds() %>%
        as_tibble() %>%
        dplyr::rename(long_EPSG_3035=x, lat_EPSG_3035=y)
      
      # merge the population coordinates with the climatic variables
      clim_ref <- bind_cols(ref_clim_data[,c("PopulationCode",clim_var_avg_selected)],pop_coord_proj)
      
      # GDM models
      gdm_tab <- formatsitepair(bioData=fst_matrices[[snp_set_i$set_code]], 
                                bioFormat=3,
                                XColumn="long_EPSG_3035", 
                                YColumn="lat_EPSG_3035", 
                                predData=clim_ref, 
                                siteColumn="PopulationCode")  
      
      gdm_mod  <- gdm(data=gdm_tab, geo=TRUE)
      
      saveRDS(gdm_mod, file = here(paste0("outputs/ScotsPine/GDM/models/",
                                          snp_set_i$set_code,"_em",
                                          em_i,"_refperiod_",
                                          gsub("\\-", "_", ref_timeslice),
                                          ".rds")))
      
      
      # Plot predicted climatic distance vs observed genetic distance
      overlay_x <- seq( from=min(gdm_mod$ecological), to=max(gdm_mod$ecological), length=length(gdm_mod$ecological))
      overlay_y <- 1 - exp( - overlay_x )
      dfline <- tibble(x=overlay_x,y=overlay_y)  
        
      p <- tibble(clim=gdm_mod$ecological,obs=gdm_mod$observed) %>% 
        ggplot(aes(x=clim,y=obs)) + 
        geom_point(color=rgb(0,0,1,0.5)) +
        geom_line(data=dfline,aes(x=x,y=y), color="gray60") +
        xlim(c(0,1.7)) + 
        ylim(c(0,1)) + 
        xlab("Predicted climatic distance") +
        ylab("Observed genetic distance") +
        ggtitle(paste0(snp_set_i$set_name,"- Ensemble member ",em_i)) + 
        theme_bw() +
        theme(title = element_text(size=8)) 
        
        ggsave(filename = here(paste0("figs/ScotsPine/GDM/PredictedClimaticDistance_vs_ObservedGeneticDistance_Plots/",
                                  snp_set_i$set_code,"_em",em_i,".pdf")), p, width=5, height=5)
        
        # Predicted versus observed genetic distance
        p <- tibble(obs=gdm_mod$observed, pred=gdm_mod$predicted) %>% 
          ggplot(aes(x=obs,y=pred)) + 
          geom_abline(intercept = 0, slope = 1, color="gray60") +
          geom_point(color=rgb(0,0,1,0.5)) +
          xlim(c(0,1)) + ylim(c(0,1)) + 
          xlab("Observed genetic distance") +
          ylab("Predicted genetic distance") +
          ggtitle(paste0(snp_set_i$set_name,"- Ensemble member ",em_i)) + 
          theme_bw()+
          theme(title = element_text(size=8))
        
        ggsave(filename = here(paste0("figs/ScotsPine/GDM/Predicted_vs_ObservedGeneticDistance_Plots/",
                                  snp_set_i$set_code,"_em",em_i,".pdf")), p, width=5, height=5)
        
        # Ispine plots
        spline_data <- isplineExtract(gdm_mod)
        
        # extract variables with importance different from 0
        predictors <- which(apply(spline_data$y,2, sum) != 0) %>% names() 
        plots <- lapply(predictors, function(predictor){
          
          if(predictor=="Geographic"){
            predictor_name <- "Geography (km)"
            spline_data$x[,predictor] <- spline_data$x[,predictor] / 1000
            }  else {
              predictor_name <- clim_var_names[clim_var_names$code == predictor,] %>% pull(.)}
          
          tibble(x=spline_data$x[,predictor],
                 y=spline_data$y[,predictor]) %>% 
            ggplot() + 
            geom_line(aes(x=x,y=y),color="blue",linewidth=2) +
            ylim(c(0,1.3)) + 
            ylab("Partial genetic distance") +
            xlab(predictor_name) +
            theme(axis.title.x = element_text(size=9)) +
            theme_bw() 
          })
        
        # make title
        title <- ggdraw() + 
          draw_label(paste0(snp_set_i$set_name, " - Ensemble member ",em_i),
                     fontface = 'bold',
                     x = 0,
                     hjust = 0) +
          theme(plot.margin = margin(0, 0, 0, 7))
        
        # merge plots
        p <- plot_grid(plotlist=plots)
        
        # If we want to save the figures without title
        # ggsave(here(paste0("figs/ScotsPine/GDM/Ispline_Plots/",snp_set_i$set_code,"_em",em_i,"_noTitle.pdf")), p, width=14,height=9, device="pdf")
        
        # merge plots + title
        p <- plot_grid(title, p, ncol = 1,rel_heights = c(0.1, 1))
        
        # save the plots
        ggsave(here(paste0("figs/ScotsPine/GDM/Ispline_Plots/",snp_set_i$set_code,"_em",em_i,".pdf")), p, width=14,height=9, device="pdf")
        
  
  
  gdm_tab_pred <- clim_ref %>% 
    dplyr::select(-PopulationCode) %>% 
    dplyr::rename(xCoord=long_EPSG_3035,yCoord=lat_EPSG_3035) %>% 
    set_colnames(paste0("s1.",colnames(.)))

  
  
  lapply(RCPs, function(rcp_i){
        
        lapply(pred_timeslices, function(pred_timeslice_i){
          
           
          # Mean - center the current and future climatic data
          gdm_tab_pred <- clim_data %>%
            filter(ensemble_member == em_i &
                     rcp_scenario == rcp_i &
                     timeslice == pred_timeslice_i &
                     clim_var_avg %in% clim_var_avg_selected) %>%
            dplyr::select(-clim_var, -time_averages,-contains("ude")) %>%
            pivot_wider(names_from = clim_var_avg, values_from = value) %>%
            left_join(clim_ref[,c("PopulationCode","long_EPSG_3035","lat_EPSG_3035")], by="PopulationCode") %>%
            dplyr::select(contains("EPSG_3035"),any_of(clim_var_avg_selected)) %>%
            dplyr::rename(xCoord=long_EPSG_3035,yCoord=lat_EPSG_3035) %>%
            set_colnames(paste0("s2.",colnames(.))) %>%
            bind_cols(gdm_tab_pred, .) %>%
            mutate(distance=0,weights=0) %>%
            dplyr::select(distance,weights,contains("Coord"), everything())
          
            gen_off <- predict(gdm_mod,gdm_tab_pred)
            
            ref_clim_data %>% 
              dplyr::select(PopulationCode, Latitude, Longitude) %>% 
              mutate(pred_timeslice = pred_timeslice_i,
                     ensemble_member = em_i,
                     rcp_scenario = rcp_i) %>% 
              dplyr::mutate(var_code = snp_set_i$set_code,
                            var_name = snp_set_i$set_name,
                            method = "GDM",
                            val = gen_off)
 
          }) %>% bind_rows()
        }) %>% bind_rows()
      }) %>% bind_rows()
    }) %>% bind_rows()

saveRDS(df_GDM_gen_off, here(paste0("outputs/ScotsPine/GDM/df_gen_off_pred.rds")))

Genomic offset predictions using GDM are negatively correlated with the predictions from other methods. To ensure there are no errors in my analysis, I have verified the following:

Comparing predictions with geo=TRUE versus geo=FALSE (i.e., correction or not for geography).
Using population coordinates with either a geographic (spheroid) CRS (units in degrees longitude and latitude) or a projected (Cartesian, 2D) CRS (EPSG:3035).
Comparing predictions with and without mean-centered and scaled climatic data.
Exploring different standardizations of the Fst matrix.

However, none of these adjustments altered the results, which remain opposite to the predictions from other methods.

7 Latent Factor Mixed Model (LFMM)

Code

RCPs <- c("RCP2.6","RCP8.5")
EMs <- c("01","04","06","15")
ref_timeslice <- "1980-2000"
pred_timeslices <- c("2030-2050","2060-2080")
clim_data <- readRDS(here("data/ScotsPine/subdf.rds")) 
geno_data <- readRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/imputed_allele_counts_withoutMAF.rds")) # arrange by field code
clim_var_avg_selected <- unique(clim_data$clim_var_avg)
K <- c(1,4)


df_LFMM_gen_off <- lapply(snp_sets, function(snp_set_i){
  
  # # Subset genomic data for the set of selected SNPs
  # sub_geno_data <- geno_data  %>% 
  #   dplyr::select(any_of(snp_set_i$set_snps))
    
    lapply(EMs, function(em_i){
      
      lapply(RCPs, function(rcp_i){
        
        lapply(pred_timeslices, function(pred_timeslice_i){
          
          # Mean - center the current and future climatic data
          clim_data_scaled <- scale_clim(clim_data = clim_data,
                                         em = em_i,
                                         rcp = rcp_i,  
                                         ref_timeslice = ref_timeslice,
                                         pred_timeslice = pred_timeslice_i,
                                         clim_var_avg_selected = clim_var_avg_selected)
          
          # Dataset of reference climates
          env_ref <- geno_data %>% 
            dplyr::select(PopulationCode, Family, FieldCode) %>% 
            left_join(clim_data_scaled$ref_climdf_scaled %>% 
                        dplyr::select(PopulationCode,any_of(clim_var_avg_selected)),
                      by = "PopulationCode") %>%
            arrange(PopulationCode) %>% 
            dplyr::select(-PopulationCode, -Family, -FieldCode)
          
          # Dataset of prediction climates
          env_pred <- geno_data %>% 
            dplyr::select(PopulationCode, Family, FieldCode) %>% 
            left_join(clim_data_scaled$pred_climdf_scaled %>% 
                        dplyr::select(PopulationCode,any_of(clim_var_avg_selected)),
                      by = "PopulationCode") %>% 
            arrange(PopulationCode) %>% 
            dplyr::select(-PopulationCode, -Family, -FieldCode)
          
          # Genomic data - subset with selected SNPs
          geno_sub <- geno_data %>% 
            arrange(PopulationCode) %>% 
            dplyr::select(-PopulationCode, -Family, -FieldCode, any_of(snp_set_i$set_snps))
          
          lapply(K, function(k_i){
          
          # Estimating the genomic offset
          # ggap <- genetic.gap(input = geno_sub,
          #                     env = env_ref,
          #                     pred.env = env_pred,
          #                     K = k_i)
          # 
          # saveRDS(ggap, file = here(paste0("outputs/ScotsPine/LFMM/models/",
          #                                  snp_set_i$set_code,"_em",
          #                                  em_i,
          #                                  "_refperiod_",
          #                                  gsub("\\-", "_", ref_timeslice),
          #                                  "_predperiod_",
          #                                  gsub("\\-", "_", pred_timeslice_i),
          #                                  "_K",
          #                                  k_i,
          #                                  ".rds")))
          
          ggap <- readRDS(file = here(paste0("outputs/ScotsPine/LFMM/models/",
                                             snp_set_i$set_code,"_em",
                                             em_i,
                                             "_refperiod_",
                                             gsub("\\-", "_", ref_timeslice),
                                             "_predperiod_",
                                             gsub("\\-", "_", pred_timeslice_i),
                                             "_K",
                                             k_i,
                                             ".rds")))
      
          clim_data_scaled$pred_climdf_scaled %>% 
            dplyr::select(-any_of(clim_var_avg_selected)) %>% 
            dplyr::rename(pred_timeslice = timeslice) %>% 
            dplyr::mutate(var_code = snp_set_i$set_code,
                          var_name = snp_set_i$set_name,
                          method = paste0("LFMM_K",k_i),
                          val = unique(ggap$offset))
          
          }) %>% bind_rows()
          }) %>% bind_rows()
        }) %>% bind_rows()
      }) %>% bind_rows()
    }) %>% bind_rows()

saveRDS(df_LFMM_gen_off, here(paste0("outputs/ScotsPine/LFMM/df_gen_off_pred.rds")))

8 Climatic distances

Code

RCPs <- c("RCP2.6","RCP8.5")
EMs <- c("01","04","06","15")
ref_timeslice <- "1980-2000"
pred_timeslices <- c("2030-2050","2060-2080")
clim_data <- readRDS(here("data/ScotsPine/subdf.rds")) 
clim_var_avg_selected <- unique(clim_data$clim_var_avg)


df_clim_dist <- lapply(EMs, function(em_i){
  
  lapply(RCPs, function(rcp_i){
    
    lapply(pred_timeslices, function(pred_timeslice_i){
      
      
      # Mean - center the current and future climatic data
      clim_data_scaled <- scale_clim(clim_data = clim_data,
                                     em = em_i,
                                     rcp = rcp_i,  
                                     ref_timeslice = ref_timeslice,
                                     pred_timeslice = pred_timeslice_i,
                                     clim_var_avg_selected = clim_var_avg_selected)
      
      # Calculate absolute differences 
      df <- abs(clim_data_scaled$ref_climdf_scaled[,clim_var_avg_selected] - 
                  clim_data_scaled$pred_climdf_scaled[,clim_var_avg_selected]) %>% 
        as_tibble() %>% 
        bind_cols(clim_data_scaled$pred_climdf_scaled %>% 
                    dplyr::select(PopulationCode, Latitude, Longitude, ensemble_member, rcp_scenario, timeslice),.) %>% 
        dplyr::rename(pred_timeslice = timeslice) %>% 
        mutate(eucli = unique(sqrt(rowSums((clim_data_scaled$ref_climdf_scaled[,clim_var_avg_selected] -
                                              clim_data_scaled$pred_climdf_scaled[,clim_var_avg_selected])^2))))
      }) %>% bind_rows()
    }) %>% bind_rows()
  }) %>% bind_rows() %>% 
  
  mutate(ensemble_member = paste0("EM ",ensemble_member))


# We calculate the climatic distances for the average across ensemble members
df_clim_dist <- df_clim_dist %>%
  #group_by(PopulationCode, rcp_scenario,pred_timeslice) %>%
  summarize(across(where(is.numeric), \(x) mean(x, na.rm = TRUE)), .by=c("PopulationCode", "rcp_scenario", "pred_timeslice")) %>% 
  mutate(ensemble_member="Average across EMs") %>% 
  bind_rows(df_clim_dist) %>% 
  pivot_longer(cols = all_of(c(clim_var_avg_selected,"eucli")), names_to = "var_code", values_to = "val") %>% 
  left_join(clim_var_names %>% 
              bind_rows(tibble(code = "eucli", name= "All climatic variables (Euclidean distance)")) %>% 
              dplyr::rename(var_code = code, var_name = name)) %>% 
  mutate(method = "CD")

Joining with `by = join_by(var_code)`

9 Merging climatic distances and genomic offset predictions

Code

df_go <- readRDS(here("outputs/ScotsPine/GF/df_gen_off_pred.rds")) %>% 
  bind_rows(readRDS(here("outputs/ScotsPine/LFMM/df_gen_off_pred.rds"))) %>% 
  bind_rows(readRDS(here("outputs/ScotsPine/RDA/df_gen_off_pred.rds"))) %>% 
  bind_rows(readRDS(here("outputs/ScotsPine/GDM/df_gen_off_pred.rds"))) %>% 
  mutate(ensemble_member = paste0("EM ",ensemble_member))

df_go_clim_dist <- df_go %>% 
  summarize(across(where(is.numeric), \(x) mean(x, na.rm = TRUE)), .by=c("PopulationCode", 
                                                                         "rcp_scenario", 
                                                                         "pred_timeslice",
                                                                         "var_name",
                                                                         "var_code",
                                                                         "method")) %>% 
  mutate(ensemble_member="Average across EMs") %>% 
  bind_rows(df_go) %>% 
  bind_rows(df_clim_dist) %>% 
  group_by(ensemble_member, rcp_scenario, pred_timeslice, var_name, method) %>%
  mutate(pop_rank = rank(-val, ties.method = "min")) %>%
  ungroup() %>% 
  mutate(method_var_name = paste0(method, " - ", var_name))

saveRDS(df_go_clim_dist, here("outputs/ScotsPine/df_gen_off_pred_clim_dist.rds"))
saveRDS(df_go_clim_dist, here("shiny/VizGenomicOffsetPredictionsVariabilityScotsPine/df_gen_off_pred_clim_dist.rds"))

Climatic distances and genomic offset predictions can be visualized on the following Shiny app: https://juliettearchambeau.shinyapps.io/VizGenomicOffsetPredictionsVariabilityScotsPine/

1 Using all SNPs

2 Climatic variables

3 scale_clim function

4 Gradient Forest (GF)

5 Redundancy Analysis (RDA) and partial RDA

6 Generalized Dissimilarity Modeling (GDM)

7 Latent Factor Mixed Model (LFMM)

8 Climatic distances

9 Merging climatic distances and genomic offset predictions

3 `scale_clim` function