Genomic offset predictions in Scots pine with Redundancy Analysis

Author

Juliette Archambeau

Published

June 7, 2024

Aim : identify potential candidate SNPs for adaptation to climate and predict the genomic offset under future climates using the Redundancy Analysis.

1 The data

1.1 Population allele frequencies

Calculated in FormattingScotsPineGenomicData report.

Code

# We load the population allele frequencies filtered for MAF
geno <- readRDS(here("data/ScotsPine/GenomicData/FormattedData/FilteredGenomicData_noMAF_PopAlleleFreq.rds")) %>% 
  arrange(PopulationCode) %>% # to be sure that colors in the PCA graph are attributed to the right individuals
  column_to_rownames("PopulationCode")

geno[1:10,1:10] %>% kable_mydf()

	AX-117395395	AX-117395404	AX-117395409	AX-117395425	AX-117395439	AX-117395441	AX-117395444	AX-117395445	AX-117395460	AX-117395482
AB	0.23	0.48	0.08	0.37	0.51	0.45	0.08	0.44	0.05	0.03
AC	0.40	0.43	0.05	0.30	0.34	0.35	0.27	0.45	0.09	0.04
AM	0.30	0.45	0.08	0.51	0.37	0.39	0.14	0.41	0.05	0.02
BB	0.48	0.32	0.19	0.32	0.54	0.45	0.11	0.50	0.13	0.07
BE	0.44	0.61	0.06	0.51	0.39	0.40	0.31	0.41	0.12	0.04
BW	0.41	0.31	0.08	0.41	0.45	0.32	0.25	0.37	0.07	0.02
CC	0.38	0.40	0.25	0.33	0.45	0.52	0.30	0.38	0.01	0.04
CG	0.39	0.57	0.08	0.51	0.54	0.42	0.10	0.52	0.10	0.01
CR	0.32	0.58	0.03	0.26	0.65	0.52	0.32	0.51	0.09	0.02
GA	0.45	0.36	0.03	0.39	0.30	0.41	0.19	0.45	0.08	0.04

1.2 Population structure

Principal components calculated with population allele frequencies not filtered for MAF (see PopulationGeneticStructureAndVariancePartioningScotsPine report). We use the first three PCs of a PCA to account for population structure in the pRDA.

Code

PCs <- readRDS(file = here("outputs/ScotsPine/PCA/PCs.rds"))

1.3 Climatic variables

Selected climatic variables (see ExploreChessCapeClimaticData report) from the CHESS-SCAPE climatic data.

Code

climdf <- readRDS(here("data/ScotsPine/subdf.rds")) %>% 
  dplyr::filter(timeslice == "1980-2000") %>% 
  dplyr::select(-rcp_scenario) %>% 
  dplyr::distinct()

# selected climatic variables
clim_var_avg_selected <- unique(climdf$clim_var_avg)

Dataset with 420 rows including:

21 populations
4 ensemble members
5 climatic variables.

2 Identifying outlier SNPs

We identify outlier SNPs with a RDA not correcting for population structure and a pRDA correcting for population structure (using the three PCS of a PCA).

We use climatic data of the 1980-2000 timeslice and the ensemble member 01 (used as a control in the CHESS-SCAPE climatic data).

2.0.1 RDA models

RDA summary statistics are summarized in the pdfs RDAsummary.pdf and the RDA plots can be found in the pdfs RDAplots.pdf.

Code

# =========
# FUNCTIONS
# =========


# Function to create a S3 object with the RDA model and some info
# ===============================================================
new_RDA <- function(form_rda,selected_var,PC_names,mod_rda,r2,eigenvalues,model_significance,axis_significance,vif){
  
  structure(
    .Data = list(form_rda=form_rda,
                 selected_var=selected_var,
                 PC_names=PC_names,
                 mod_rda=mod_rda,
                 r2=r2,
                 eigenvalues=eigenvalues,
                 model_significance=model_significance,
                 axis_significance=axis_significance,
                 vif=vif),
    creation_time = Sys.time(),
    class = "myrda"
  )
  }



# Function to print S3 object of class 'myrda'
# ============================================
print.myrda <- function(x,...){

  print_message <- c(
    paste0("  RDA formula: ", x$form_rda),
    paste0("  Creation time: ", format(attr(x, "creation_time"))),
    paste0(c("  List elements:", names(x)),collapse="   ")
    )

  cat(
    print_message,
    sep = "\n"
  )

}


# Function to run the RDA models and store the info in a S3 object of class 'myrda'
# =================================================================================
runRDA <- function(df,geno_pop,selected_var,pop_structure_correction){

  # write RDA formulas
form_clim_var <- paste(selected_var,collapse= " + ")
if(pop_structure_correction==T) {
  PC_names <- colnames(df) %>% stringr::str_subset("^PC")
  form_pgs_var <- PC_names %>%  paste(collapse= " + ")
  form_rda <- paste("geno_pop ~ ", form_clim_var,"+ Condition(",paste(c(form_pgs_var),collapse=" + " ), ")")
} else {
  PC_names = NULL
  form_rda <- paste("geno_pop ~ ", form_clim_var) 
  }
mod_rda <- rda(as.formula(form_rda), df)
r2 <- RsquareAdj(mod_rda)
eigenvalues <- summary(eigenvals(mod_rda, model = "constrained"))
model_significance <- anova.cca(mod_rda, parallel=getOption("mc.cores")) # default is permutation=999
axis_significance <- anova.cca(mod_rda, by="axis", parallel=getOption("mc.cores"))
vif <- vif.cca(mod_rda)

return(new_RDA(form_rda=form_rda,
               selected_var=selected_var,
               PC_names=PC_names,
               mod_rda=mod_rda,
               r2=r2,
               eigenvalues=eigenvalues,
               model_significance=model_significance,
               axis_significance=axis_significance,
               vif=vif))
  
}



# Function to generate a pdf with the RDA model outputs
# =====================================================
viz_RDAsummary <- function(x){
  
  # check the class of x
  stopifnot("x is not of class myrda" = class(x)=="myrda")
  
  # Generate R2 table
r2_tab <- x$r2 %>% 
  as.data.frame() %>%
  pivot_longer(everything()) %>%  
  column_to_rownames(var ="name") %>% 
  round(2) %>% 
  ggtexttable(rows = row.names(.),cols = NULL, theme = ttheme("blank"))
  
  # Generate table of eigenvalues and proportion of variance explained
eigenvalues_tab <- x$eigenvalues %>% 
  as.data.frame() %>% 
  round(2) %>% 
  ggtexttable(rows = rownames(.), theme = ttheme("blank")) %>% 
  tab_add_hline(at.row = 1:2, row.side = "top", linewidth = 2)

  # Generate table of model and axis significance
signi_tab <- bind_rows(as.data.frame(x$model_significance[-nrow(x$model_significance),]),
          as.data.frame(x$axis_significance[-nrow(x$axis_significance),])) %>% 
  dplyr::mutate(across(!Df, ~ round(.x,3))) %>% 
  ggtexttable(rows = rownames(.), theme = ttheme("blank")) %>% 
  tab_add_hline(at.row = 1:2, row.side = "top", linewidth = 2) %>% 
  tab_add_hline(at.row = 3, row.side = "top", linewidth = 0.01)

  # Generate table for VIF (Variance Inflation Factors)
vif_tab <- x$vif %>% 
  as.data.frame() %>% 
  set_colnames("VIF") %>% 
  round(2) %>% 
  ggtexttable(rows = rownames(.), theme = ttheme("blank")) 
  
  # Generate the screeplot of the eigenvalues
ggscreeplot <- x$eigenvalues %>% #
  as.data.frame() %>% 
  dplyr::filter(row.names(.) == "Eigenvalue")  %>% 
  pivot_longer(everything(),names_to="PC",values_to="eigenvalues") %>%
  ggplot(aes(x= PC,
             y=eigenvalues,
             group=1)) +
  geom_point(size=4)+
  geom_line() +
  ylab("Eigen values") + xlab("") + 
  labs(title="Scree plot") + 
  theme_bw()


ggarrange(ggarrange(r2_tab,signi_tab,vif_tab,nrow=3),
           ggarrange(eigenvalues_tab,ggscreeplot,nrow=2), 
           nrow = 1, 
           ncol = 2) %>% 
annotate_figure(top = text_grob(x$form_rda, 
                                size = 15, 
                                color = 'black', 
                                face = 'bold'))
  
}


# Function to plot the RDA models
# ==============================
plot_RDA <- function(df,x){
  
  # check the class of x
  stopifnot("x is not of class myrda" = class(x)=="myrda")
  
  par(mfrow=c(1,2))
  rdaplots <-  lapply(2:3, function(second_axis){
    
    # Plot the RDA without plotting the points initially
    plot(x$mod_rda, type="n", scaling=3, choices=c(1,second_axis))
    
    # Plot species (SNPs) points
    points(x$mod_rda, display="species", pch=20, cex=0.7, col="gray32", scaling=3, choices=c(1,second_axis))
    
    # Plot site (populations) points
    points(x$mod_rda, display="sites", pch=21, cex=1.3, col="gray32", scaling=3, bg="lightslateblue", choices=c(1,second_axis))
    
    # Add text for biplot arrows
    text(x$mod_rda, scaling=3, display="bp", col="mediumseagreen", cex=1, choices=c(1,second_axis))
    
    # Extract site coordinates
    site_scores <- scores(x$mod_rda, display = "sites", scaling = 3, choices = c(1, second_axis))

    # Add text labels for populations using extracted coordinates and adj parameter
    site_scores[rownames(site_scores)=="RD","RDA1"] <- site_scores[rownames(site_scores)=="RD","RDA1"] -0.05
    text(site_scores[, 1], site_scores[, 2], labels = rda_df$PopulationCode, col = "lightslateblue", cex = 1, adj = c(0, 1.5))
    recordPlot()
    
  })
  
  title(x$form_rda, line=-3,outer = TRUE)
}

Code

# We run the RDA with one set of climatic variable and with or without correction for population structure
rda_combinations <- list(
  list(selected_var = clim_var_avg_selected,
       pop_structure_correction = FALSE),
  list(selected_var = clim_var_avg_selected,
       pop_structure_correction = TRUE))

# RDA dataset 
rda_df <- climdf %>% 
  dplyr::filter(timeslice=="1980-2000" & ensemble_member =="01") %>% 
  dplyr::select(-time_averages,-clim_var) %>% 
  pivot_wider(names_from = clim_var_avg, values_from = value) %>% 
  dplyr::mutate(across(all_of(clim_var_avg_selected), ~ (. - mean(.)) / sd(.))) %>% 
  inner_join(PCs, by="PopulationCode")

Code

# Run the RDA models and store their information
rda_models <- lapply(rda_combinations, function(x){
  runRDA(df = rda_df,
         geno_pop = geno,
         selected_var = x$selected_var,
         pop_structure_correction = x$pop_structure_correction)})

# save the RDA models and their information
saveRDS(rda_models, here("outputs/ScotsPine/RDA/RDAmodels.rds"))

# viz RDA models and their summary information
pdf(width = 13, height = 8, here("figs/ScotsPine/RDA/RDAsummary.pdf"))
lapply(rda_models, viz_RDAsummary)
dev.off()

# plot RDA models
pdf(width = 20, height = 12, here("figs/ScotsPine/RDA/RDAplots.pdf"))
lapply(rda_models, function(x) plot_RDA(df=rda_df,x=x))
dev.off()

2.0.2 Identifying outliers

Different methods can be used to identify outlier SNPs.

Outliers can be identified based on their extremeness along a distribution of Mahalanobis distances estimated between each locus and the center of the RDA space using K number of axes (Capblancq et al. 2018). The number of axes used (K) is determined by looking at the amount of information retained on the different axes of the RDA (Capblancq et al. 2018). In our study, we use K=2. \(p\)-values and \(q\)-values are then calculated for each SNP with the rdadapt function from Capblancq and Forester (2021)
- 1.1 Capblancq et al. (2020) uses a \(p\)-value threshold with a Bonferroni correction to account for multiple testing: \(p\)-value < 0.01 / number of SNPs (see code).
- 1.2 Capblancq and Forester (2021) (code) rank-ordered the SNPs based on their \(p\)-values (the \(p\)-value of a given SNP captures how far its Mahalanobis distance is from the distribution of Mahalanobis distances of all SNPs) and identified the 0.2% or 0.5% of the SNPs with the lowest \(p\)-values.
- 1.3 in our study, we use a FDR (False Discovery Rate) threshold of 5% applied on the \(q\)-values (see François et al. 2016).
Outliers can also been identified based on the extreme loadings on each retained axis. Method used in Forester et al. (2018) (see code, in which a 3 standard deviation cutoff is used (two-tailed \(p\)-value = 0.0027). In our study, we use a 3 standard deviation cutoff.

In the present study, we identified the outlier SNPs with both methods 1.3 and 2, but then we only kept outlier SNPs identified in 1.3 in the following analyses (i.e., to predict the genomic offset).

Code

# ====================================
# Function to identify the outlier SNPs
# =====================================

identify_outliers <- function(rda_model, df, geno_pop){

# outlier detection based on  extreme Mahalanobis distances from the RDA center
# =============================================================================

# Genome scan with K = 2 (degrees of freedom of the X2 distribution)
  # rdadapt is a function to conduct a RDA based genome scan (from Capblancq & Forester 2021)
GSout <- rdadapt(rda_model$mod_rda, 2) %>%  
  as_tibble() %>% 
  mutate(snp=colnames(geno_pop)) 

###############################
# Plot the p-values and q-values
################################
# Common features of the plots
common_theme <- theme_bw()
common_fill_color <- "steelblue"
common_color <- "gray70"
common_alpha <- 0.7
common_binwidth <- 0.02
axis_text_size <- 13
axis_title_size <- 13

# Create histogram for pvalues
p1 <- ggplot(GSout, aes(x = pvalues)) +
  geom_histogram(binwidth = common_binwidth, fill = common_fill_color, color = common_color, alpha = common_alpha) +
  xlab("p-values") +
  ylab("Frequency") +
  common_theme +
  theme(axis.text = element_text(size=axis_text_size),
        axis.title = element_text(size=axis_title_size))

# Create histogram for qvalues
p2 <- ggplot(GSout, aes(x = qvalues)) +
  geom_histogram(binwidth = common_binwidth, fill = common_fill_color, color = common_color, alpha = common_alpha) +
  xlab("q-values") +
  ylab("") +
  common_theme +
  theme(axis.text = element_text(size=axis_text_size),
        axis.title = element_text(size=axis_title_size))

# Arrange the two histograms side by side
grid.arrange(p1, p2, ncol = 2, top = rda_model$form_rda)



# P-values threshold after Bonferroni correction
outliers_maha <- GSout %>%
  filter(pvalues <  0.01/nrow(.)) %>%
  mutate(maha_meth = TRUE) %>%
  dplyr::select(snp,maha_meth) # we do not keep the p.values and q.values in the outlier table

# FDR threshold of 5%
outliers_maha <- GSout %>%
  filter(qvalues <  0.05) %>%
  mutate(maha_meth = TRUE) %>%
  dplyr::select(snp,maha_meth) # we do not keep the p.values and q.values in the outlier table

# P-values are rank-ordered and we select the 0.5% with the lowest p-values
# outliers_maha <- GSout %>% 
#   arrange(pvalues) %>% 
#   slice(1:(0.005*nrow(.))) %>% 
#   mutate(maha_meth = TRUE) %>% 
#   dplyr::select(snp,maha_meth) # we do not keep the p.values and q.values in the outlier table


# outlier detection based on  extreme axis loadings
# =================================================

# As we identify the SNPs that has loadings along the significant axes,
# we first extract the number of significant axes 
nb_signi_axis <- rda_model$axis_significance %>% 
  as.data.frame() %>% 
  dplyr::rename(pvalue="Pr(>F)") %>% 
  dplyr::filter(pvalue<0.05) %>% 
  nrow()
  
if(nb_signi_axis>0){
loads <- scores(rda_model$mod_rda, choices=c(1:(nrow(as.data.frame(rda_model$axis_significance))-1)), display="species")

outliers_loads <- lapply(1:nb_signi_axis, function(id.axis){
  out <- detectoutliers(loads[,id.axis],3) # function to identify outliers based on their RDA loadings (from  Forester et al. 2018)
  tibble(
    #axis=rep(id.axis,times=length(out)), # we do not keep the information about the axis
    snp = names(out),
    load_meth = TRUE,
    # loading = out # we do not keep the loadings
  )
}) %>% 
  list_rbind() %>% 
  distinct()

# Merge outliers identified by the two methods
# ============================================

all_outliers <- full_join(outliers_maha,outliers_loads, by="snp") %>% 
  mutate(across(c(maha_meth,load_meth), ~replace_na(.,FALSE)))

} else {
    
all_outliers <- outliers_maha %>% mutate(load_meth=FALSE)
  }


all_outliers <- map(all_outliers$snp, function(snp){
 cordf <-  cor(df[,names(rda_model$vif)],geno_pop[,snp]) %>% 
    as.data.frame() %>% 
    rownames_to_column() %>% 
    pivot_wider(names_from="rowname", values_from = "V1") %>% 
     mutate(max_var = names(which.max(abs(.)))) %>% 
     mutate(max_var_clim = names(which.max(abs(.[,rda_model$selected_var]))))
}) %>%
  setNames(all_outliers$snp) %>% 
  list_rbind(names_to = "snp") %>% 
  inner_join(all_outliers,by="snp")


list(rda_model = rda_model,
     GSout = GSout, # keep the p.values for the Manhattan plot
     outliers = all_outliers)

}

We plot below the histograms of \(p\)-values and \(q\)-values of the SNPs.

Code

# We load the RDA models
rda_models <- readRDS(here("outputs/ScotsPine/RDA/RDAmodels.rds"))

# we identify the outlier SNPs for the four RDA models
rda_outliers <- rda_models %>% 
  lapply(function(x) {
    identify_outliers(rda_model = x, df = rda_df, geno_pop = geno)
  })

Code

saveRDS(rda_outliers, here("outputs/ScotsPine/RDA/IdentificationOutliers/RDA_outliers.rds"))

2.0.3 Outlier visualization

2.0.3.1 RDA plots based on Forester et al. (2018)

We visualize the outliers with RDA plots following Forester et al. (2018) (see code). Figures are stored in RDAplots_outliers_1.pdf.

Code

# Function to make one RDA plot with different colors for outlier SNPs
make_one_outlier_rda_plot <- function(mod_rda, data_plots, xlim, ylim, second_axis,arrow_length,outline_col){
  
plot(mod_rda, type="n", scaling=3, xlim=xlim, ylim=ylim, choices=c(1,second_axis))
points(mod_rda, display="species", pch=21, cex=1, col=data_plots$allsnps$nooutlier_color, bg=data_plots$allsnps$nooutlier_bg , scaling=3, choices=c(1,second_axis))
points(mod_rda, display="species", pch=21, cex=1, col=data_plots$allsnps$outlier_color, bg=data_plots$allsnps$outlier_bg, scaling=3, choices=c(1,second_axis))
text(mod_rda, scaling=3, display="bp", col="#0868ac", cex=1, arrow.mul=arrow_length, choices=c(1,second_axis))
legend("topleft", legend=data_plots$legend_names, bty="n", col=outline_col, pch=21, cex=1, pt.bg=data_plots$legend_colors,ncol=1,title=data_plots$legend_title) 
recordPlot()

}


# Function that combines two types of plot:
  # plots in which outliers are colored according to their most associated climatic variable
  # plots in which outliers are colored according to the detection method used to identify them:
      # a method based on the Mahalanobis distances from the center of the RDA space
      # a method based on the extreme SNP loadings on each significant axis
combine_rda_outlier_plots <- function(rda_out,color_clim_var){

# Storing data and options to generate the plots in a list
list_data_plots <- list(clim=list(), # options/data specific to plots with colors based on the climatic variables
                        meth=list()) # options/data specific to plots with colors based on the detection methods


# Figure options shared between the two types of plot
# ===================================================

nooutliers_bg <- '#f1eef6'
outline_col <- 'gray32'
transparent_col <- rgb(0,1,0, alpha=0)


  
# Fig options/data specific to plots based on the climatic variables
# ==================================================================

# Attribute one color to each climatic variable 
selected_var_colors <- tibble(max_var_clim = names(color_clim_var),
                              outlier_bg = color_clim_var) %>% 
  dplyr::filter(max_var_clim %in% rda_out$rda_model$selected_var)

# Generate a dataframe with color information for outliers
outlier_colors <-  rda_out$outliers %>% 
  dplyr::select(snp, max_var_clim) %>% 
  left_join(selected_var_colors, by="max_var_clim")
 
# Generate a dataframe with color information for all SNPs
list_data_plots$clim$allsnps <- tibble(snp=rownames(rda_out$rda_model$mod_rda$CCA$v)) %>% 
  left_join(outlier_colors, by="snp") %>% 
  mutate(outlier_bg = replace_na(outlier_bg,transparent_col),
         outlier_color = ifelse(outlier_bg == transparent_col, transparent_col, outline_col),
         nooutlier_bg = ifelse(outlier_bg == transparent_col, nooutliers_bg, transparent_col),
         nooutlier_color = ifelse(outlier_bg == transparent_col, outline_col, transparent_col))

# Legend options
list_data_plots$clim$legend_title <- 'Climatic variable'
list_data_plots$clim$legend_names <- selected_var_colors$max_var_clim
list_data_plots$clim$legend_colors <- selected_var_colors$outlier_bg

  
# Fig options/data specific to plots based on the detection methods
# =================================================================

# Attribute one color to each method
method_colors <- c('#FCF926','#3BCD24','#CD24B2') %>% setNames(c("Mahalanobis distance","Axis loadings", "Both"))

# Legend options
list_data_plots$meth$legend_colors <- method_colors
list_data_plots$meth$legend_names <- names(method_colors)
list_data_plots$meth$legend_title <- 'Outlier detection method'


# attribute colors to all SNPs
list_data_plots$meth$allsnps <-  tibble(snp=rownames(rda_out$rda_model$mod_rda$CCA$v)) %>% 
  left_join(rda_out$outliers[,c("snp", "load_meth", "maha_meth")],by="snp") %>%  
  mutate(outlier_bg = case_when(load_meth == TRUE & maha_meth == TRUE ~ method_colors["Both"],
                                load_meth == TRUE & maha_meth == FALSE ~ method_colors["Axis loadings"],
                                load_meth == FALSE & maha_meth == TRUE ~ method_colors["Mahalanobis distance"],
                                is.na(load_meth) & is.na(maha_meth) ~ transparent_col),
         outlier_color = ifelse(outlier_bg == transparent_col, transparent_col, outline_col),
         nooutlier_bg = ifelse(outlier_bg == transparent_col, nooutliers_bg, transparent_col),
         nooutlier_color = ifelse(outlier_bg == transparent_col, outline_col, transparent_col))


# Generate the RDA plots
# =====================

# For RDA models with 2 explanatory variables, we can only plot RDA1 vs RDA2

if(rda_out$rda_model$axis_significance %>% as.data.frame() %>% nrow()-1>2) { # rda models with more than 2 climatic variables
    
par(mfrow=c(2,2))
lapply(2:3, function(second_axis){ # generate two plots: RDA1 vs RDA2 and RDA1 vs RDA3
    lapply(list_data_plots, function(x) make_one_outlier_rda_plot(mod_rda = rda_out$rda_model$mod_rda,
                                                                  data_plots = x,
                                                                  xlim=c(-1,1), 
                                                                  ylim=c(-0.5,0.5),
                                                                  arrow_length=1.25,
                                                                  outline_col=outline_col,
                                                                  second_axis=second_axis))

  })} else {  # rda models with 2 climatic variables

par(mfrow=c(1,2))
    lapply(list_data_plots, function(x) make_one_outlier_rda_plot(mod_rda = rda_out$rda_model$mod_rda,
                                                                  data_plots = x,
                                                                  xlim=c(-0.3,0.3), 
                                                                  ylim=c(-0.5,0.5),
                                                                  outline_col=outline_col,
                                                                  arrow_length=0.6,
                                                                  second_axis=2))
    
  }

  title(rda_out$rda_model$form_rda, line=-3,outer = TRUE) # rda model formula as title
   
}

Code

# Define the colors associated with each climatic variable for the following visualizations
color_clim_var <- sample(c('#1f78b4','#a6cee3','#6a3d9a','#e31a1c','#33a02c','#ffff33','#fb9a99',
                    '#FF7FE6', '#FFB675', '#51FF74','#B4FF32'), length(clim_var_avg_selected), replace=F) %>%
  setNames(clim_var_avg_selected)

Code

# Plot RDA outliers with different colors based either on the detection method or the most associated climatic variable
pdf(width = 12, height = 8, here("figs/ScotsPine/RDA/RDAplots_outliers_1.pdf"))
lapply(rda_outliers, function(x) combine_rda_outlier_plots(rda_out = x,color_clim_var = color_clim_var))
dev.off()

2.0.3.2 RDA plots based on Capblancq and Forester (2021)

We visualize the outliers with RDA and Manhattan plots following Capblancq and Forester (2021) (code). Figures are stored in RDAplots_outliers_2.pdf.

Code

make_rda_ggplot <- function(data_plots,TAB_var){ 

  data_plots$tab[order(data_plots$tab$snp_type),] %>% 
  ggplot() +
  geom_hline(yintercept=0, linetype="dashed", color = gray(.80), linewidth=0.6) +
  geom_vline(xintercept=0, linetype="dashed", color = gray(.80), linewidth=0.6) +
  geom_point(aes(x=RDA1*20, y=RDA2*20, colour = snp_type), size = 1.4) +
  scale_color_manual(values = data_plots$legend_colors) +
  geom_segment(data = TAB_var, aes(xend=RDA1, yend=RDA2, x=0, y=0), 
               colour="black", linewidth=0.15, linetype=1, arrow=arrow(length = unit(0.02, "npc"))) +
  geom_text(data = TAB_var, aes(x=1.1*RDA1, y=1.1*RDA2, label = row.names(TAB_var)), size = 2.5) +
  xlab("RDA 1") + ylab("RDA 2") +
  facet_wrap(~"RDA space") +
  guides(color=guide_legend(title=data_plots$legend_title)) +
  theme_bw(base_size = 11) +
  theme(panel.background = element_blank(), 
        legend.background = element_blank(), 
        panel.grid = element_blank(), 
        plot.background = element_blank(), 
        legend.text=element_text(size=rel(.8)), 
        strip.text = element_text(size=11))
  
  
  
}

make_manhattan_plot <- function(data_plots){ 

data_plots$tab[order(data_plots$tab$snp_type),] %>% 
ggplot() +
  geom_hline(yintercept=-log10(0.01/length(data_plots$tab$pvalues)), linetype="dashed", color = gray(.80), linewidth=0.6) +
  geom_point(aes(x=pos, y=-log10(pvalues), col = snp_type), size=1.4) +
  scale_color_manual(values = data_plots$legend_colors) +
  xlab("Loci") + ylab("-log10(p-values)") +
  facet_wrap(~"Manhattan plot", nrow = 3) +
  guides(color=guide_legend(title=data_plots$legend_title)) +
  theme_bw(base_size = 11) +
  theme(legend.position="right", legend.background = element_blank(), panel.grid = element_blank(), legend.box.background = element_blank(), plot.background = element_blank(), panel.background = element_blank(), legend.text=element_text(size=rel(.8)), strip.text = element_text(size=11))
  }


# Function that combines two types of plot:
  # plots in which outliers are colored according to their most associated climatic variable
  # plots in which outliers are colored according to the detection method used to identify them:
      # a method based on the Mahalanobis distances from the center of the RDA space
      # a method based on the extreme SNP loadings on each significant axis
combine_rda_outlier_ggplots <- function(rda_out,color_clim_var){

# Storing data and options to generate the plots in a list
list_data_plots <- list(clim=list(), # options/data specific to plots with colors based on the climatic variables
                        meth=list()) # options/data specific to plots with colors based on the detection methods


# Figure options shared between the two types of plot
# ===================================================

nooutliers_bg <- "gray90"
nooutliers_legend <- "No detection"

locus_scores <- scores(rda_out$rda_model$mod_rda, 
                       choices=c(1:2), # we use K=2 number of axes
                       display="species", # 'species' in the vegan package correspond to the loci
                       scaling="none") %>% 
  as.data.frame() %>% 
  rownames_to_column("snp") %>% 
  as_tibble() %>% 
  inner_join(rda_out$GSout, by="snp") %>% 
  mutate(pos=seq(1:nrow(.)))

TAB_var <- as.data.frame(scores(rda_out$rda_model$mod_rda, choices=c(1,2), display="bp"))
  
# Fig options/data specific to plots based on the climatic variables
# ==================================================================

# Legend options
vec_color_clim_var <- color_clim_var[names(color_clim_var) %in% unique(rda_out$outliers$max_var_clim)]
list_data_plots$clim$legend_title <- 'Climatic variable'
list_data_plots$clim$legend_names <- c(nooutliers_legend,names(vec_color_clim_var))
list_data_plots$clim$legend_colors <- c(nooutliers_bg,vec_color_clim_var) %>% unname()


list_data_plots$clim$tab <-  locus_scores %>% 
  left_join(rda_out$outliers[,c("snp", "max_var_clim")],by="snp") %>%  
  mutate(max_var_clim = replace_na(max_var_clim,nooutliers_legend)) %>% 
  mutate(max_var_clim =factor(max_var_clim,levels=list_data_plots$clim$legend_names)) %>% 
  dplyr::rename(snp_type = max_var_clim)


# Fig options/data specific to plots based on the detection methods
# =================================================================


# Legend options
list_data_plots$meth$legend_colors <- c(nooutliers_bg,'#FCF926','#3BCD24','#CD24B2')
list_data_plots$meth$legend_names <- c(nooutliers_legend,"Mahalanobis distance","Axis loadings", "Both")
list_data_plots$meth$legend_title <- 'Outlier detection'


list_data_plots$meth$tab <-   locus_scores %>% 
  left_join(rda_out$outliers[,c("snp", "load_meth", "maha_meth")],by="snp") %>%  
  mutate(snp_type = case_when(load_meth == TRUE & maha_meth == TRUE ~  list_data_plots$meth$legend_names[[4]],
                              load_meth == TRUE & maha_meth == FALSE ~ list_data_plots$meth$legend_names[[3]],
                              load_meth == FALSE & maha_meth == TRUE ~ list_data_plots$meth$legend_names[[2]],
                              is.na(load_meth) & is.na(maha_meth) ~ list_data_plots$meth$legend_names[[1]])) %>% 
  mutate(snp_type=factor(snp_type,levels=list_data_plots$meth$legend_names)) 



# Generate the RDA plots
# =====================


p1 <- lapply(list_data_plots, function(x) make_rda_ggplot(data_plots = x,
                                                         TAB_var=TAB_var))

p2 <- lapply(list_data_plots, function(x) make_manhattan_plot(data_plots = x))

plot_row <- plot_grid(p1$clim, p1$meth,p2$clim,p2$meth, ncol=2)

# Title options
title <- ggdraw() + 
  draw_label(
    rda_out$rda_model$form_rda,
    fontface = 'bold',
    x = 0,
    hjust = 0
  ) +
  theme(
    # add margin on the left of the drawing canvas,
    # so title is aligned with left edge of first plot
    plot.margin = margin(0, 0, 0, 7)
  )

# merge title and plots
plot_grid(
  title, plot_row,
  ncol = 1,
  # rel_heights values control vertical title margins
  rel_heights = c(0.1, 1)
)

   
}

Code

# Plot RDA outliers with different colors based either on the detection method or the most associated climatic variable
pdf(width = 12, height = 8, here("figs/ScotsPine/RDA/RDAplots_outliers_2.pdf"))
lapply(rda_outliers, function(x) combine_rda_outlier_ggplots(rda_out = x, color_clim_var=color_clim_var))
dev.off()

The figures RDA_outlier_plots_pRDA.pdf and RDA_outlier_plots_pRDA.pdf generated below show only the outlier SNPs identified based on the Mahanobilis distances along the first two axes and a FDR threshold of 5% (i.e., the outlier SNPs that are then used to predict the genomic offset).

Code

# =========================================
# Function to generate the SI outlier plots
# =========================================
generate_SI_outlier_plots <- function(rda_out){
  
  
locus_scores <- scores(rda_out$rda_model$mod_rda, 
                       choices=c(1:2), # we use K=2 number of axes
                       display="species", # 'species' in the vegan package correspond to the loci
                       scaling="none") %>% 
  as.data.frame() %>% 
  rownames_to_column("snp") %>% 
  as_tibble() %>% 
  inner_join(rda_out$GSout, by="snp") %>% 
  mutate(pos=seq(1:nrow(.)))

TAB_var <- as.data.frame(scores(rda_out$rda_model$mod_rda, choices=c(1,2), display="bp"))

# Legend options
legend_colors <- c("gray90",'#CD24B2')
legend_names <- c("Non-outliers","Outliers")
legend_title <- 'Outlier detection'


tab <-   locus_scores %>% 
  left_join(rda_out$outliers[,c("snp", "maha_meth")],by="snp") %>%  
  mutate(snp_type = case_when(maha_meth == TRUE ~  legend_names[[2]],
                              is.na(maha_meth) | maha_meth==FALSE ~ legend_names[[1]])) %>% 
  mutate(snp_type=factor(snp_type,levels=legend_names)) 


# Plot with RDA space
# ===================
p1 <- tab[order(tab$snp_type),] %>% 
  ggplot() +
  geom_hline(yintercept=0, linetype="dashed", color = gray(.80), linewidth=0.6) +
  geom_vline(xintercept=0, linetype="dashed", color = gray(.80), linewidth=0.6) +
  geom_point(aes(x=RDA1*20, y=RDA2*20, colour = snp_type), size = 1.4) +
  scale_color_manual(values = legend_colors) +
  geom_segment(data = TAB_var, aes(xend=RDA1, yend=RDA2, x=0, y=0), 
               colour="chartreuse3", linewidth=0.15, linetype=1, arrow=arrow(length = unit(0.02, "npc"))) +
  geom_text(data = TAB_var, aes(x=0.9*RDA1, y=1.1*RDA2, label = row.names(TAB_var)), size = 2.5, colour="chartreuse4") +
  xlab("RDA 1") + ylab("RDA 2") +
  facet_wrap(~"RDA space") +
  theme_bw(base_size = 11) +
  theme(panel.background = element_blank(), 
        legend.position = "none",
        legend.background = element_blank(), 
        panel.grid = element_blank(), 
        plot.background = element_blank(), 
        legend.text=element_text(size=rel(.8)), 
        strip.text = element_text(size=11))


# Manhattan plot
# ==============
p2 <- tab[order(tab$snp_type),] %>% 
ggplot() +
  geom_hline(yintercept=-log10(0.01/length(tab$pvalues)), linetype="dashed", color = gray(.80), linewidth=0.6) +
  geom_point(aes(x=pos, y=-log10(pvalues), col = snp_type), size=1.4) +
  scale_color_manual(values = legend_colors) +
  xlab("Loci") + ylab("-log10(p-values)") +
  facet_wrap(~"Manhattan plot", nrow = 3) +
  theme_bw(base_size = 11)  +
  theme(legend.position="right",  
        panel.grid = element_blank(), 
        legend.title=element_blank(),
        legend.box.background = element_blank(), plot.background = element_blank(), panel.background = element_blank(), legend.text=element_text(size=rel(.8)), strip.text = element_text(size=11))


# Merge the two plots
# ===================
p2_legend <- get_legend(p2)
p2 <- p2 + theme(legend.position = "none")
plot_row <- plot_grid(p1,p2, ncol=2)
plot_row <- plot_grid(plot_row, p2_legend, ncol=2,rel_widths = c(1, 0.1))


# Save the plots as pdf
# =====================
if(grepl("Condition",rda_out$rda_model$form_rda)==TRUE){rda_name <- "pRDA"} else { rda_name <- "RDA"}
ggsave(plot_row, device = "pdf", filename = here(paste0("figs/ScotsPine/RDA/RDA_outlier_plots_",rda_name,".pdf")), width=12, height=5)
}


# Run the function for the RDA models fitted on the set 2 of climatic variables
lapply(rda_outliers, function(x) generate_SI_outlier_plots(rda_out = x))

Warning in get_plot_component(plot, "guide-box"): Multiple components found; returning the first one. To return all, use `return_all = TRUE`.
Warning in get_plot_component(plot, "guide-box"): Multiple components found; returning the first one. To return all, use `return_all = TRUE`.

2.0.4 Common outliers btw RDA and pRDA

The Venn diagram below shows the common outliers among those identified with RDA and pRDA (considering all outliers, i.e. also those identified based on their loadings along the significant axes)

Code

# all outliers
list("RDA" = rda_outliers[[1]]$outliers$snp,
       "pRDA" = rda_outliers[[2]]$outliers$snp) %>% 
  ggVennDiagram(lty="solid", size=0.2, label = "count", label_alpha=0) + 
  scale_fill_gradient2(low = "white", high = 'darkgoldenrod3') + 
  scale_color_manual(values = rep("darkgoldenrod1",6)) + 
  guides(fill = "none") + 
  theme(text = element_text(size=16)) + 
  scale_x_continuous(expand = expansion(mult = .2))

The Venn diagram below shows the common outliers between those identified with RDA and pRDA (considering only outliers identified with the Mahalanobis distances along the first two axes, i.e. those that will be used for genomic offset predictions).

Code

# Maha outliers
list("RDA" = rda_outliers[[1]]$outliers %>% dplyr::filter(maha_meth==TRUE) %>% .$snp,
       "pRDA" = rda_outliers[[2]]$outliers%>% dplyr::filter(maha_meth==TRUE) %>% .$snp) %>% 
  ggVennDiagram(lty="solid", size=0.2, label = "count", label_alpha=0) + 
  scale_fill_gradient2(low = "white", high = 'darkgoldenrod3') + 
  scale_color_manual(values = rep("darkgoldenrod1",6)) + 
  guides(fill = "none") + 
  theme(text = element_text(size=16)) + 
  scale_x_continuous(expand = expansion(mult = .2))

2.0.5 Retained outliers

We keep the outliers identified with the Mahalanobis distances and a FDR threshold of 5%.

Code

rda_outliers <- list("RDA" = list(set_name = "Outliers identified with RDA",
                                  set_code = "RDA_outliers",
                                  outliers = rda_outliers[[1]]$outliers %>% dplyr::filter(maha_meth==TRUE) %>% .$snp),
                     "pRDA" =  list(set_name = "Outliers identified with pRDA",
                                  set_code = "pRDA_outliers",
                                  outliers = rda_outliers[[2]]$outliers%>% dplyr::filter(maha_meth==TRUE) %>% .$snp))

3 RDA Genomic offset

In the present report, genomic offset is predicted under the future scenario RCP8.5 (pessimistic), ensemble member 01 and timeslice 2030-2050.

Code

EM_i <- "01"
rcp_scenario_i <- "RCP8.5" 
timeslice_pred <- "2030-2050"

3.1 Climatic data

Code

climdf <- readRDS(here("data/ScotsPine/subdf.rds")) %>% 
  dplyr::filter(rcp_scenario == rcp_scenario_i &
                ensemble_member == "01") %>% 
  dplyr::select(-clim_var, -time_averages) %>% 
  pivot_wider(names_from=clim_var_avg, values_from =value)

clim_ref <- climdf %>% dplyr::filter(timeslice == "1980-2000")
clim_pred <- climdf %>% dplyr::filter(timeslice == timeslice_pred)

# Scaling the datasets
scale_params <- lapply(clim_var_avg_selected, function(x){
    
    vec_var <- clim_ref[,x] %>% pull()
    
    list(mean = mean(vec_var),
         sd = sd(vec_var))
    
  }) %>% setNames(clim_var_avg_selected)

# Scale climatic data across the reference period
clim_ref <- clim_ref %>% 
  dplyr::mutate(across(any_of(clim_var_avg_selected), ~ (. - mean(.)) / sd(.)))

# Scale climatic data across the future period
for(i in clim_var_avg_selected){
  clim_pred[,i] <- (clim_pred[,i] - scale_params[[i]]$mean) / scale_params[[i]]$sd
}

3.2 RDA models

Code

runRDA <- function(snp_set, clim_var, clim_df, geno){
  
# Keep the genomic data of the selected SNPs
geno <- geno[,snp_set]
  
# RDA formula  
form_rda <- paste("geno ~ ", paste(clim_var,collapse= " + "))
  
# run RDA with only the selected SNPs
rda_outliers <- rda(as.formula(form_rda), clim_df)

}

Code

rda_list <- lapply(rda_outliers, function(x) {
  
x$rda_model <- runRDA(snp_set = x$outliers,
                    clim_var = clim_var_avg_selected,
                    clim_df = clim_ref,
                    geno = geno)
return(x)

}) %>% setNames(names(rda_outliers))

An RDA biplot can be used to visualize the relationship between the selected SNPs and the underlying climatic variables.

Code

make_RDAbiplot <- function(x) {
  
TAB_loci <- as.data.frame(scores(x$rda_model, choices=c(1:2), display="species", scaling="none"))
TAB_var <- as.data.frame(scores(x$rda_model, choices=c(1:2), display="bp"))

eigenvalues <- summary(eigenvals(x$rda_model, model = "constrained")) %>% as.data.frame()
varexp_axis1 <- eigenvalues[2,1] *100 
varexp_axis2 <- eigenvalues[2,2] *100 


ggplot() +
  geom_hline(yintercept=0, linetype="dashed", color = gray(.80), linewidth=0.6) +
  geom_vline(xintercept=0, linetype="dashed", color = gray(.80), linewidth=0.6) +
  geom_point(data = TAB_loci, aes(x=RDA1*3, y=RDA2*3), colour = '#CD24B2', size = 2, alpha = 0.8) + #"#F9A242FF"
  geom_segment(data = TAB_var, aes(xend=RDA1, yend=RDA2, x=0, y=0), colour="black", linewidth=0.15, linetype=1, arrow=arrow(length = unit(0.02, "npc"))) +
  geom_text(data = TAB_var, aes(x=1.1*RDA1, y=1.1*RDA2, label = row.names(TAB_var)), size = 3) +
  xlab(paste0("RDA 1 (",round(varexp_axis1,1),"%)")) + 
  ylab(paste0("RDA 2 (",round(varexp_axis2,1),"%)")) +
  facet_wrap(~paste0(x$set_name, " (n = ",length(x$outliers),")")) +
  guides(color=guide_legend(title="Locus type")) +
  theme_bw(base_size = 11) +
  theme(panel.grid = element_blank(), 
        plot.background = element_blank(), 
        panel.background = element_blank(), 
        strip.text = element_text(size=11))

}

Code

rda_biplots <- lapply(rda_list, make_RDAbiplot)

grid_RDAbiplots <- plot_grid(rda_biplots[[1]],rda_biplots[[2]],
                         nrow=1)

ggsave(here("figs/ScotsPine/RDA/RDAbiplots.pdf"), grid_RDAbiplots, width = 14, height=6)

grid_RDAbiplots

3.3 Estimating the genomic offset

Code

# Function to calculate the RDA genetic offset for spatial points
# ===============================================================

# clim_ref = climatic data used to fit the RDA model (the climate-of-origin of the populations)

# clim_pred = climatic data used for predictions (so either the future climate of the populations, 
            # or climate of the common gardens or the NFI plots)

# weights = if NULL, the adaptive index is not weighted by the relative importance of the RDA axes in 
# explaining the variance.

# K = number of RDA axes used to calculate the genomic offset

# snp_set = list with at least the RDA models 


pred_GO_spatial_points <- function(snp_set, 
                                   K, 
                                   clim_ref, 
                                   clim_pred, 
                                   clim_var,
                                   weights = NULL, 
                                   CG=F){
  
# weights based on the variance explained by the different axes
weights <- (snp_set$rda_model$CCA$eig/sum(snp_set$rda_model$CCA$eig))[1:K]


list_AI <- lapply(list(clim_ref,clim_pred), function(df){
  
  lapply(1:K, function(i){
  
# Below we calculate the adaptive index for the axis i
# For that, we multiply the value of the variables at the location of the population
    # by the loadings of the variables along the axis i
ai <- df %>%
  dplyr::select(any_of(clim_var)) %>% 
  apply(1, function(x) sum(x * snp_set$rda_model$CCA$biplot[,i]))

if(!is.null(weights)){ai <- ai * weights[i]}
    
  }) %>% 
    setNames(paste0("RDA", 1:length(.))) %>% 
    as.data.frame()
  
})

if(CG==F){
  
eucloffset <- unlist(lapply(1:nrow(list_AI[[1]]), function(x) dist(rbind(list_AI[[1]][x,], list_AI[[2]][x,]), method = "euclidean")))

} else if(CG==T){
  
eucloffset <- lapply(1:nrow(list_AI[[2]]), function(x) as.matrix(dist(rbind(list_AI[[2]][x,],list_AI[[1]]), method = "euclidean"))[2:(nrow(list_AI[[1]])+1),1]) %>% 
  setNames(clim_pred[,1] %>% pull()) %>% 
  as.data.frame() %>% 
  cbind(clim_ref[,1])
  
  
  }

return(eucloffset)
}

Code

rda_list <- lapply(rda_list, function(x){
  
x$GO <- pred_GO_spatial_points(snp_set = x,
                               clim_var=clim_var_avg_selected,
                               K = 2, 
                               clim_ref =clim_ref,
                               clim_pred = clim_pred,
                               weights = T)

return(x)
})

3.4 Mapping the genomic offset

We plot the genomic offset predictions at the location of the 21 Scots pine populations.

Code

tibble(PopulationCode = clim_ref$PopulationCode,
       GO = rda_list$RDA$GO) %>%
  make_spatialpoints_map(var="GO",
                         ggtitle="Genomic offset predictions without correction for population structure")

Code

tibble(PopulationCode = clim_ref$PopulationCode,
       GO = rda_list$pRDA$GO) %>%
  make_spatialpoints_map(var="GO",
                         ggtitle="Genomic offset predictions with correction for population structure")

Correlation between genomic offset with or without correction for population structure.

Code

cor(rda_list$RDA$GO,rda_list$pRDA$GO)

[1] 0.8537676

References

Capblancq, Thibaut, and Brenna R. Forester. 2021. “Redundancy Analysis: A Swiss Army Knife for Landscape Genomics.” Methods in Ecology and Evolution 12 (12): 2298–2309. https://doi.org/10.1111/2041-210X.13722.

Capblancq, Thibaut, Keurcien Luu, Michael G. B. Blum, and Eric Bazin. 2018. “Evaluation of Redundancy Analysis to Identify Signatures of Local Adaptation.” Molecular Ecology Resources 18 (6): 1223–33. https://doi.org/10.1111/1755-0998.12906.

Capblancq, Thibaut, Xavier Morin, Maya Gueguen, Julien Renaud, Stéphane Lobreaux, and Eric Bazin. 2020. “Climate-Associated Genetic Variation in Fagus Sylvatica and Potential Responses to Climate Change in the French Alps.” Journal of Evolutionary Biology 33 (6): 783–96. https://doi.org/10.1111/jeb.13610.

Forester, Brenna R., Jesse R. Lasky, Helene H. Wagner, and Dean L. Urban. 2018. “Comparing Methods for Detecting Multilocus Adaptation with Multivariate Genotype–Environment Associations.” Molecular Ecology 27 (9): 2215–33. https://doi.org/10.1111/mec.14584.

François, Olivier, Helena Martins, Kevin Caye, and Sean D. Schoville. 2016. “Controlling False Discoveries in Genome Scans for Selection.” Molecular Ecology 25 (2): 454–69. https://doi.org/10.1111/mec.13513.