Genomic offset predictions in Scots pine with Redundancy Analysis
Author
Juliette Archambeau
Published
June 7, 2024
Aim : identify potential candidate SNPs for adaptation to climate and predict the genomic offset under future climates using the Redundancy Analysis.
1 The data
1.1 Population allele frequencies
Calculated in FormattingScotsPineGenomicData report.
Code
# We load the population allele frequencies filtered for MAFgeno <-readRDS(here("data/ScotsPine/GenomicData/FormattedData/FilteredGenomicData_noMAF_PopAlleleFreq.rds")) %>%arrange(PopulationCode) %>%# to be sure that colors in the PCA graph are attributed to the right individualscolumn_to_rownames("PopulationCode")geno[1:10,1:10] %>%kable_mydf()
AX-117395395
AX-117395404
AX-117395409
AX-117395425
AX-117395439
AX-117395441
AX-117395444
AX-117395445
AX-117395460
AX-117395482
AB
0.23
0.48
0.08
0.37
0.51
0.45
0.08
0.44
0.05
0.03
AC
0.40
0.43
0.05
0.30
0.34
0.35
0.27
0.45
0.09
0.04
AM
0.30
0.45
0.08
0.51
0.37
0.39
0.14
0.41
0.05
0.02
BB
0.48
0.32
0.19
0.32
0.54
0.45
0.11
0.50
0.13
0.07
BE
0.44
0.61
0.06
0.51
0.39
0.40
0.31
0.41
0.12
0.04
BW
0.41
0.31
0.08
0.41
0.45
0.32
0.25
0.37
0.07
0.02
CC
0.38
0.40
0.25
0.33
0.45
0.52
0.30
0.38
0.01
0.04
CG
0.39
0.57
0.08
0.51
0.54
0.42
0.10
0.52
0.10
0.01
CR
0.32
0.58
0.03
0.26
0.65
0.52
0.32
0.51
0.09
0.02
GA
0.45
0.36
0.03
0.39
0.30
0.41
0.19
0.45
0.08
0.04
1.2 Population structure
Principal components calculated with population allele frequencies not filtered for MAF (see PopulationGeneticStructureAndVariancePartioningScotsPine report). We use the first three PCs of a PCA to account for population structure in the pRDA.
We identify outlier SNPs with a RDA not correcting for population structure and a pRDA correcting for population structure (using the three PCS of a PCA).
We use climatic data of the 1980-2000 timeslice and the ensemble member 01 (used as a control in the CHESS-SCAPE climatic data).
2.0.1 RDA models
RDA summary statistics are summarized in the pdfs RDAsummary.pdf and the RDA plots can be found in the pdfs RDAplots.pdf.
Code
# =========# FUNCTIONS# =========# Function to create a S3 object with the RDA model and some info# ===============================================================new_RDA <-function(form_rda,selected_var,PC_names,mod_rda,r2,eigenvalues,model_significance,axis_significance,vif){structure(.Data =list(form_rda=form_rda,selected_var=selected_var,PC_names=PC_names,mod_rda=mod_rda,r2=r2,eigenvalues=eigenvalues,model_significance=model_significance,axis_significance=axis_significance,vif=vif),creation_time =Sys.time(),class ="myrda" ) }# Function to print S3 object of class 'myrda'# ============================================print.myrda <-function(x,...){ print_message <-c(paste0(" RDA formula: ", x$form_rda),paste0(" Creation time: ", format(attr(x, "creation_time"))),paste0(c(" List elements:", names(x)),collapse=" ") )cat( print_message,sep ="\n" )}# Function to run the RDA models and store the info in a S3 object of class 'myrda'# =================================================================================runRDA <-function(df,geno_pop,selected_var,pop_structure_correction){# write RDA formulasform_clim_var <-paste(selected_var,collapse=" + ")if(pop_structure_correction==T) { PC_names <-colnames(df) %>% stringr::str_subset("^PC") form_pgs_var <- PC_names %>%paste(collapse=" + ") form_rda <-paste("geno_pop ~ ", form_clim_var,"+ Condition(",paste(c(form_pgs_var),collapse=" + " ), ")")} else { PC_names =NULL form_rda <-paste("geno_pop ~ ", form_clim_var) }mod_rda <-rda(as.formula(form_rda), df)r2 <-RsquareAdj(mod_rda)eigenvalues <-summary(eigenvals(mod_rda, model ="constrained"))model_significance <-anova.cca(mod_rda, parallel=getOption("mc.cores")) # default is permutation=999axis_significance <-anova.cca(mod_rda, by="axis", parallel=getOption("mc.cores"))vif <-vif.cca(mod_rda)return(new_RDA(form_rda=form_rda,selected_var=selected_var,PC_names=PC_names,mod_rda=mod_rda,r2=r2,eigenvalues=eigenvalues,model_significance=model_significance,axis_significance=axis_significance,vif=vif))}# Function to generate a pdf with the RDA model outputs# =====================================================viz_RDAsummary <-function(x){# check the class of xstopifnot("x is not of class myrda"=class(x)=="myrda")# Generate R2 tabler2_tab <- x$r2 %>%as.data.frame() %>%pivot_longer(everything()) %>%column_to_rownames(var ="name") %>%round(2) %>%ggtexttable(rows =row.names(.),cols =NULL, theme =ttheme("blank"))# Generate table of eigenvalues and proportion of variance explainedeigenvalues_tab <- x$eigenvalues %>%as.data.frame() %>%round(2) %>%ggtexttable(rows =rownames(.), theme =ttheme("blank")) %>%tab_add_hline(at.row =1:2, row.side ="top", linewidth =2)# Generate table of model and axis significancesigni_tab <-bind_rows(as.data.frame(x$model_significance[-nrow(x$model_significance),]),as.data.frame(x$axis_significance[-nrow(x$axis_significance),])) %>% dplyr::mutate(across(!Df, ~round(.x,3))) %>%ggtexttable(rows =rownames(.), theme =ttheme("blank")) %>%tab_add_hline(at.row =1:2, row.side ="top", linewidth =2) %>%tab_add_hline(at.row =3, row.side ="top", linewidth =0.01)# Generate table for VIF (Variance Inflation Factors)vif_tab <- x$vif %>%as.data.frame() %>%set_colnames("VIF") %>%round(2) %>%ggtexttable(rows =rownames(.), theme =ttheme("blank")) # Generate the screeplot of the eigenvaluesggscreeplot <- x$eigenvalues %>%#as.data.frame() %>% dplyr::filter(row.names(.) =="Eigenvalue") %>%pivot_longer(everything(),names_to="PC",values_to="eigenvalues") %>%ggplot(aes(x= PC,y=eigenvalues,group=1)) +geom_point(size=4)+geom_line() +ylab("Eigen values") +xlab("") +labs(title="Scree plot") +theme_bw()ggarrange(ggarrange(r2_tab,signi_tab,vif_tab,nrow=3),ggarrange(eigenvalues_tab,ggscreeplot,nrow=2), nrow =1, ncol =2) %>%annotate_figure(top =text_grob(x$form_rda, size =15, color ='black', face ='bold'))}# Function to plot the RDA models# ==============================plot_RDA <-function(df,x){# check the class of xstopifnot("x is not of class myrda"=class(x)=="myrda")par(mfrow=c(1,2)) rdaplots <-lapply(2:3, function(second_axis){# Plot the RDA without plotting the points initiallyplot(x$mod_rda, type="n", scaling=3, choices=c(1,second_axis))# Plot species (SNPs) pointspoints(x$mod_rda, display="species", pch=20, cex=0.7, col="gray32", scaling=3, choices=c(1,second_axis))# Plot site (populations) pointspoints(x$mod_rda, display="sites", pch=21, cex=1.3, col="gray32", scaling=3, bg="lightslateblue", choices=c(1,second_axis))# Add text for biplot arrowstext(x$mod_rda, scaling=3, display="bp", col="mediumseagreen", cex=1, choices=c(1,second_axis))# Extract site coordinates site_scores <-scores(x$mod_rda, display ="sites", scaling =3, choices =c(1, second_axis))# Add text labels for populations using extracted coordinates and adj parameter site_scores[rownames(site_scores)=="RD","RDA1"] <- site_scores[rownames(site_scores)=="RD","RDA1"] -0.05text(site_scores[, 1], site_scores[, 2], labels = rda_df$PopulationCode, col ="lightslateblue", cex =1, adj =c(0, 1.5))recordPlot() })title(x$form_rda, line=-3,outer =TRUE)}
Code
# We run the RDA with one set of climatic variable and with or without correction for population structurerda_combinations <-list(list(selected_var = clim_var_avg_selected,pop_structure_correction =FALSE),list(selected_var = clim_var_avg_selected,pop_structure_correction =TRUE))# RDA dataset rda_df <- climdf %>% dplyr::filter(timeslice=="1980-2000"& ensemble_member =="01") %>% dplyr::select(-time_averages,-clim_var) %>%pivot_wider(names_from = clim_var_avg, values_from = value) %>% dplyr::mutate(across(all_of(clim_var_avg_selected), ~ (. -mean(.)) /sd(.))) %>%inner_join(PCs, by="PopulationCode")
Code
# Run the RDA models and store their informationrda_models <-lapply(rda_combinations, function(x){runRDA(df = rda_df,geno_pop = geno,selected_var = x$selected_var,pop_structure_correction = x$pop_structure_correction)})# save the RDA models and their informationsaveRDS(rda_models, here("outputs/ScotsPine/RDA/RDAmodels.rds"))# viz RDA models and their summary informationpdf(width =13, height =8, here("figs/ScotsPine/RDA/RDAsummary.pdf"))lapply(rda_models, viz_RDAsummary)dev.off()# plot RDA modelspdf(width =20, height =12, here("figs/ScotsPine/RDA/RDAplots.pdf"))lapply(rda_models, function(x) plot_RDA(df=rda_df,x=x))dev.off()
2.0.2 Identifying outliers
Different methods can be used to identify outlier SNPs.
Outliers can be identified based on their extremeness along a distribution of Mahalanobis distances estimated between each locus and the center of the RDA space using K number of axes (Capblancq et al. 2018). The number of axes used (K) is determined by looking at the amount of information retained on the different axes of the RDA (Capblancq et al. 2018). In our study, we use K=2. \(p\)-values and \(q\)-values are then calculated for each SNP with the rdadapt function from Capblancq and Forester (2021)
1.1 Capblancq et al. (2020) uses a \(p\)-value threshold with a Bonferroni correction to account for multiple testing: \(p\)-value < 0.01 / number of SNPs (see code).
1.2 Capblancq and Forester (2021) (code) rank-ordered the SNPs based on their \(p\)-values (the \(p\)-value of a given SNP captures how far its Mahalanobis distance is from the distribution of Mahalanobis distances of all SNPs) and identified the 0.2% or 0.5% of the SNPs with the lowest \(p\)-values.
1.3 in our study, we use a FDR (False Discovery Rate) threshold of 5% applied on the \(q\)-values (see François et al. 2016).
Outliers can also been identified based on the extreme loadings on each retained axis. Method used in Forester et al. (2018) (see code, in which a 3 standard deviation cutoff is used (two-tailed \(p\)-value = 0.0027). In our study, we use a 3 standard deviation cutoff.
In the present study, we identified the outlier SNPs with both methods 1.3 and 2, but then we only kept outlier SNPs identified in 1.3 in the following analyses (i.e., to predict the genomic offset).
Code
# ====================================# Function to identify the outlier SNPs# =====================================identify_outliers <-function(rda_model, df, geno_pop){# outlier detection based on extreme Mahalanobis distances from the RDA center# =============================================================================# Genome scan with K = 2 (degrees of freedom of the X2 distribution)# rdadapt is a function to conduct a RDA based genome scan (from Capblancq & Forester 2021)GSout <-rdadapt(rda_model$mod_rda, 2) %>%as_tibble() %>%mutate(snp=colnames(geno_pop)) ################################ Plot the p-values and q-values################################# Common features of the plotscommon_theme <-theme_bw()common_fill_color <-"steelblue"common_color <-"gray70"common_alpha <-0.7common_binwidth <-0.02axis_text_size <-13axis_title_size <-13# Create histogram for pvaluesp1 <-ggplot(GSout, aes(x = pvalues)) +geom_histogram(binwidth = common_binwidth, fill = common_fill_color, color = common_color, alpha = common_alpha) +xlab("p-values") +ylab("Frequency") + common_theme +theme(axis.text =element_text(size=axis_text_size),axis.title =element_text(size=axis_title_size))# Create histogram for qvaluesp2 <-ggplot(GSout, aes(x = qvalues)) +geom_histogram(binwidth = common_binwidth, fill = common_fill_color, color = common_color, alpha = common_alpha) +xlab("q-values") +ylab("") + common_theme +theme(axis.text =element_text(size=axis_text_size),axis.title =element_text(size=axis_title_size))# Arrange the two histograms side by sidegrid.arrange(p1, p2, ncol =2, top = rda_model$form_rda)# P-values threshold after Bonferroni correctionoutliers_maha <- GSout %>%filter(pvalues <0.01/nrow(.)) %>%mutate(maha_meth =TRUE) %>% dplyr::select(snp,maha_meth) # we do not keep the p.values and q.values in the outlier table# FDR threshold of 5%outliers_maha <- GSout %>%filter(qvalues <0.05) %>%mutate(maha_meth =TRUE) %>% dplyr::select(snp,maha_meth) # we do not keep the p.values and q.values in the outlier table# P-values are rank-ordered and we select the 0.5% with the lowest p-values# outliers_maha <- GSout %>% # arrange(pvalues) %>% # slice(1:(0.005*nrow(.))) %>% # mutate(maha_meth = TRUE) %>% # dplyr::select(snp,maha_meth) # we do not keep the p.values and q.values in the outlier table# outlier detection based on extreme axis loadings# =================================================# As we identify the SNPs that has loadings along the significant axes,# we first extract the number of significant axes nb_signi_axis <- rda_model$axis_significance %>%as.data.frame() %>% dplyr::rename(pvalue="Pr(>F)") %>% dplyr::filter(pvalue<0.05) %>%nrow()if(nb_signi_axis>0){loads <-scores(rda_model$mod_rda, choices=c(1:(nrow(as.data.frame(rda_model$axis_significance))-1)), display="species")outliers_loads <-lapply(1:nb_signi_axis, function(id.axis){ out <-detectoutliers(loads[,id.axis],3) # function to identify outliers based on their RDA loadings (from Forester et al. 2018)tibble(#axis=rep(id.axis,times=length(out)), # we do not keep the information about the axissnp =names(out),load_meth =TRUE,# loading = out # we do not keep the loadings )}) %>%list_rbind() %>%distinct()# Merge outliers identified by the two methods# ============================================all_outliers <-full_join(outliers_maha,outliers_loads, by="snp") %>%mutate(across(c(maha_meth,load_meth), ~replace_na(.,FALSE)))} else {all_outliers <- outliers_maha %>%mutate(load_meth=FALSE) }all_outliers <-map(all_outliers$snp, function(snp){ cordf <-cor(df[,names(rda_model$vif)],geno_pop[,snp]) %>%as.data.frame() %>%rownames_to_column() %>%pivot_wider(names_from="rowname", values_from ="V1") %>%mutate(max_var =names(which.max(abs(.)))) %>%mutate(max_var_clim =names(which.max(abs(.[,rda_model$selected_var]))))}) %>%setNames(all_outliers$snp) %>%list_rbind(names_to ="snp") %>%inner_join(all_outliers,by="snp")list(rda_model = rda_model,GSout = GSout, # keep the p.values for the Manhattan plotoutliers = all_outliers)}
We plot below the histograms of \(p\)-values and \(q\)-values of the SNPs.
Code
# We load the RDA modelsrda_models <-readRDS(here("outputs/ScotsPine/RDA/RDAmodels.rds"))# we identify the outlier SNPs for the four RDA modelsrda_outliers <- rda_models %>%lapply(function(x) {identify_outliers(rda_model = x, df = rda_df, geno_pop = geno) })
We visualize the outliers with RDA plots following Forester et al. (2018) (see code). Figures are stored in RDAplots_outliers_1.pdf.
Code
# Function to make one RDA plot with different colors for outlier SNPsmake_one_outlier_rda_plot <-function(mod_rda, data_plots, xlim, ylim, second_axis,arrow_length,outline_col){plot(mod_rda, type="n", scaling=3, xlim=xlim, ylim=ylim, choices=c(1,second_axis))points(mod_rda, display="species", pch=21, cex=1, col=data_plots$allsnps$nooutlier_color, bg=data_plots$allsnps$nooutlier_bg , scaling=3, choices=c(1,second_axis))points(mod_rda, display="species", pch=21, cex=1, col=data_plots$allsnps$outlier_color, bg=data_plots$allsnps$outlier_bg, scaling=3, choices=c(1,second_axis))text(mod_rda, scaling=3, display="bp", col="#0868ac", cex=1, arrow.mul=arrow_length, choices=c(1,second_axis))legend("topleft", legend=data_plots$legend_names, bty="n", col=outline_col, pch=21, cex=1, pt.bg=data_plots$legend_colors,ncol=1,title=data_plots$legend_title) recordPlot()}# Function that combines two types of plot:# plots in which outliers are colored according to their most associated climatic variable# plots in which outliers are colored according to the detection method used to identify them:# a method based on the Mahalanobis distances from the center of the RDA space# a method based on the extreme SNP loadings on each significant axiscombine_rda_outlier_plots <-function(rda_out,color_clim_var){# Storing data and options to generate the plots in a listlist_data_plots <-list(clim=list(), # options/data specific to plots with colors based on the climatic variablesmeth=list()) # options/data specific to plots with colors based on the detection methods# Figure options shared between the two types of plot# ===================================================nooutliers_bg <-'#f1eef6'outline_col <-'gray32'transparent_col <-rgb(0,1,0, alpha=0)# Fig options/data specific to plots based on the climatic variables# ==================================================================# Attribute one color to each climatic variable selected_var_colors <-tibble(max_var_clim =names(color_clim_var),outlier_bg = color_clim_var) %>% dplyr::filter(max_var_clim %in% rda_out$rda_model$selected_var)# Generate a dataframe with color information for outliersoutlier_colors <- rda_out$outliers %>% dplyr::select(snp, max_var_clim) %>%left_join(selected_var_colors, by="max_var_clim")# Generate a dataframe with color information for all SNPslist_data_plots$clim$allsnps <-tibble(snp=rownames(rda_out$rda_model$mod_rda$CCA$v)) %>%left_join(outlier_colors, by="snp") %>%mutate(outlier_bg =replace_na(outlier_bg,transparent_col),outlier_color =ifelse(outlier_bg == transparent_col, transparent_col, outline_col),nooutlier_bg =ifelse(outlier_bg == transparent_col, nooutliers_bg, transparent_col),nooutlier_color =ifelse(outlier_bg == transparent_col, outline_col, transparent_col))# Legend optionslist_data_plots$clim$legend_title <-'Climatic variable'list_data_plots$clim$legend_names <- selected_var_colors$max_var_climlist_data_plots$clim$legend_colors <- selected_var_colors$outlier_bg# Fig options/data specific to plots based on the detection methods# =================================================================# Attribute one color to each methodmethod_colors <-c('#FCF926','#3BCD24','#CD24B2') %>%setNames(c("Mahalanobis distance","Axis loadings", "Both"))# Legend optionslist_data_plots$meth$legend_colors <- method_colorslist_data_plots$meth$legend_names <-names(method_colors)list_data_plots$meth$legend_title <-'Outlier detection method'# attribute colors to all SNPslist_data_plots$meth$allsnps <-tibble(snp=rownames(rda_out$rda_model$mod_rda$CCA$v)) %>%left_join(rda_out$outliers[,c("snp", "load_meth", "maha_meth")],by="snp") %>%mutate(outlier_bg =case_when(load_meth ==TRUE& maha_meth ==TRUE~ method_colors["Both"], load_meth ==TRUE& maha_meth ==FALSE~ method_colors["Axis loadings"], load_meth ==FALSE& maha_meth ==TRUE~ method_colors["Mahalanobis distance"],is.na(load_meth) &is.na(maha_meth) ~ transparent_col),outlier_color =ifelse(outlier_bg == transparent_col, transparent_col, outline_col),nooutlier_bg =ifelse(outlier_bg == transparent_col, nooutliers_bg, transparent_col),nooutlier_color =ifelse(outlier_bg == transparent_col, outline_col, transparent_col))# Generate the RDA plots# =====================# For RDA models with 2 explanatory variables, we can only plot RDA1 vs RDA2if(rda_out$rda_model$axis_significance %>%as.data.frame() %>%nrow()-1>2) { # rda models with more than 2 climatic variablespar(mfrow=c(2,2))lapply(2:3, function(second_axis){ # generate two plots: RDA1 vs RDA2 and RDA1 vs RDA3lapply(list_data_plots, function(x) make_one_outlier_rda_plot(mod_rda = rda_out$rda_model$mod_rda,data_plots = x,xlim=c(-1,1), ylim=c(-0.5,0.5),arrow_length=1.25,outline_col=outline_col,second_axis=second_axis)) })} else { # rda models with 2 climatic variablespar(mfrow=c(1,2))lapply(list_data_plots, function(x) make_one_outlier_rda_plot(mod_rda = rda_out$rda_model$mod_rda,data_plots = x,xlim=c(-0.3,0.3), ylim=c(-0.5,0.5),outline_col=outline_col,arrow_length=0.6,second_axis=2)) }title(rda_out$rda_model$form_rda, line=-3,outer =TRUE) # rda model formula as title}
Code
# Define the colors associated with each climatic variable for the following visualizationscolor_clim_var <-sample(c('#1f78b4','#a6cee3','#6a3d9a','#e31a1c','#33a02c','#ffff33','#fb9a99','#FF7FE6', '#FFB675', '#51FF74','#B4FF32'), length(clim_var_avg_selected), replace=F) %>%setNames(clim_var_avg_selected)
Code
# Plot RDA outliers with different colors based either on the detection method or the most associated climatic variablepdf(width =12, height =8, here("figs/ScotsPine/RDA/RDAplots_outliers_1.pdf"))lapply(rda_outliers, function(x) combine_rda_outlier_plots(rda_out = x,color_clim_var = color_clim_var))dev.off()
2.0.3.2 RDA plots based on Capblancq and Forester (2021)
We visualize the outliers with RDA and Manhattan plots following Capblancq and Forester (2021) (code). Figures are stored in RDAplots_outliers_2.pdf.
Code
make_rda_ggplot <-function(data_plots,TAB_var){ data_plots$tab[order(data_plots$tab$snp_type),] %>%ggplot() +geom_hline(yintercept=0, linetype="dashed", color =gray(.80), linewidth=0.6) +geom_vline(xintercept=0, linetype="dashed", color =gray(.80), linewidth=0.6) +geom_point(aes(x=RDA1*20, y=RDA2*20, colour = snp_type), size =1.4) +scale_color_manual(values = data_plots$legend_colors) +geom_segment(data = TAB_var, aes(xend=RDA1, yend=RDA2, x=0, y=0), colour="black", linewidth=0.15, linetype=1, arrow=arrow(length =unit(0.02, "npc"))) +geom_text(data = TAB_var, aes(x=1.1*RDA1, y=1.1*RDA2, label =row.names(TAB_var)), size =2.5) +xlab("RDA 1") +ylab("RDA 2") +facet_wrap(~"RDA space") +guides(color=guide_legend(title=data_plots$legend_title)) +theme_bw(base_size =11) +theme(panel.background =element_blank(), legend.background =element_blank(), panel.grid =element_blank(), plot.background =element_blank(), legend.text=element_text(size=rel(.8)), strip.text =element_text(size=11))}make_manhattan_plot <-function(data_plots){ data_plots$tab[order(data_plots$tab$snp_type),] %>%ggplot() +geom_hline(yintercept=-log10(0.01/length(data_plots$tab$pvalues)), linetype="dashed", color =gray(.80), linewidth=0.6) +geom_point(aes(x=pos, y=-log10(pvalues), col = snp_type), size=1.4) +scale_color_manual(values = data_plots$legend_colors) +xlab("Loci") +ylab("-log10(p-values)") +facet_wrap(~"Manhattan plot", nrow =3) +guides(color=guide_legend(title=data_plots$legend_title)) +theme_bw(base_size =11) +theme(legend.position="right", legend.background =element_blank(), panel.grid =element_blank(), legend.box.background =element_blank(), plot.background =element_blank(), panel.background =element_blank(), legend.text=element_text(size=rel(.8)), strip.text =element_text(size=11)) }# Function that combines two types of plot:# plots in which outliers are colored according to their most associated climatic variable# plots in which outliers are colored according to the detection method used to identify them:# a method based on the Mahalanobis distances from the center of the RDA space# a method based on the extreme SNP loadings on each significant axiscombine_rda_outlier_ggplots <-function(rda_out,color_clim_var){# Storing data and options to generate the plots in a listlist_data_plots <-list(clim=list(), # options/data specific to plots with colors based on the climatic variablesmeth=list()) # options/data specific to plots with colors based on the detection methods# Figure options shared between the two types of plot# ===================================================nooutliers_bg <-"gray90"nooutliers_legend <-"No detection"locus_scores <-scores(rda_out$rda_model$mod_rda, choices=c(1:2), # we use K=2 number of axesdisplay="species", # 'species' in the vegan package correspond to the lociscaling="none") %>%as.data.frame() %>%rownames_to_column("snp") %>%as_tibble() %>%inner_join(rda_out$GSout, by="snp") %>%mutate(pos=seq(1:nrow(.)))TAB_var <-as.data.frame(scores(rda_out$rda_model$mod_rda, choices=c(1,2), display="bp"))# Fig options/data specific to plots based on the climatic variables# ==================================================================# Legend optionsvec_color_clim_var <- color_clim_var[names(color_clim_var) %in%unique(rda_out$outliers$max_var_clim)]list_data_plots$clim$legend_title <-'Climatic variable'list_data_plots$clim$legend_names <-c(nooutliers_legend,names(vec_color_clim_var))list_data_plots$clim$legend_colors <-c(nooutliers_bg,vec_color_clim_var) %>%unname()list_data_plots$clim$tab <- locus_scores %>%left_join(rda_out$outliers[,c("snp", "max_var_clim")],by="snp") %>%mutate(max_var_clim =replace_na(max_var_clim,nooutliers_legend)) %>%mutate(max_var_clim =factor(max_var_clim,levels=list_data_plots$clim$legend_names)) %>% dplyr::rename(snp_type = max_var_clim)# Fig options/data specific to plots based on the detection methods# =================================================================# Legend optionslist_data_plots$meth$legend_colors <-c(nooutliers_bg,'#FCF926','#3BCD24','#CD24B2')list_data_plots$meth$legend_names <-c(nooutliers_legend,"Mahalanobis distance","Axis loadings", "Both")list_data_plots$meth$legend_title <-'Outlier detection'list_data_plots$meth$tab <- locus_scores %>%left_join(rda_out$outliers[,c("snp", "load_meth", "maha_meth")],by="snp") %>%mutate(snp_type =case_when(load_meth ==TRUE& maha_meth ==TRUE~ list_data_plots$meth$legend_names[[4]], load_meth ==TRUE& maha_meth ==FALSE~ list_data_plots$meth$legend_names[[3]], load_meth ==FALSE& maha_meth ==TRUE~ list_data_plots$meth$legend_names[[2]],is.na(load_meth) &is.na(maha_meth) ~ list_data_plots$meth$legend_names[[1]])) %>%mutate(snp_type=factor(snp_type,levels=list_data_plots$meth$legend_names)) # Generate the RDA plots# =====================p1 <-lapply(list_data_plots, function(x) make_rda_ggplot(data_plots = x,TAB_var=TAB_var))p2 <-lapply(list_data_plots, function(x) make_manhattan_plot(data_plots = x))plot_row <-plot_grid(p1$clim, p1$meth,p2$clim,p2$meth, ncol=2)# Title optionstitle <-ggdraw() +draw_label( rda_out$rda_model$form_rda,fontface ='bold',x =0,hjust =0 ) +theme(# add margin on the left of the drawing canvas,# so title is aligned with left edge of first plotplot.margin =margin(0, 0, 0, 7) )# merge title and plotsplot_grid( title, plot_row,ncol =1,# rel_heights values control vertical title marginsrel_heights =c(0.1, 1))}
Code
# Plot RDA outliers with different colors based either on the detection method or the most associated climatic variablepdf(width =12, height =8, here("figs/ScotsPine/RDA/RDAplots_outliers_2.pdf"))lapply(rda_outliers, function(x) combine_rda_outlier_ggplots(rda_out = x, color_clim_var=color_clim_var))dev.off()
The figures RDA_outlier_plots_pRDA.pdf and RDA_outlier_plots_pRDA.pdf generated below show only the outlier SNPs identified based on the Mahanobilis distances along the first two axes and a FDR threshold of 5% (i.e., the outlier SNPs that are then used to predict the genomic offset).
Code
# =========================================# Function to generate the SI outlier plots# =========================================generate_SI_outlier_plots <-function(rda_out){locus_scores <-scores(rda_out$rda_model$mod_rda, choices=c(1:2), # we use K=2 number of axesdisplay="species", # 'species' in the vegan package correspond to the lociscaling="none") %>%as.data.frame() %>%rownames_to_column("snp") %>%as_tibble() %>%inner_join(rda_out$GSout, by="snp") %>%mutate(pos=seq(1:nrow(.)))TAB_var <-as.data.frame(scores(rda_out$rda_model$mod_rda, choices=c(1,2), display="bp"))# Legend optionslegend_colors <-c("gray90",'#CD24B2')legend_names <-c("Non-outliers","Outliers")legend_title <-'Outlier detection'tab <- locus_scores %>%left_join(rda_out$outliers[,c("snp", "maha_meth")],by="snp") %>%mutate(snp_type =case_when(maha_meth ==TRUE~ legend_names[[2]],is.na(maha_meth) | maha_meth==FALSE~ legend_names[[1]])) %>%mutate(snp_type=factor(snp_type,levels=legend_names)) # Plot with RDA space# ===================p1 <- tab[order(tab$snp_type),] %>%ggplot() +geom_hline(yintercept=0, linetype="dashed", color =gray(.80), linewidth=0.6) +geom_vline(xintercept=0, linetype="dashed", color =gray(.80), linewidth=0.6) +geom_point(aes(x=RDA1*20, y=RDA2*20, colour = snp_type), size =1.4) +scale_color_manual(values = legend_colors) +geom_segment(data = TAB_var, aes(xend=RDA1, yend=RDA2, x=0, y=0), colour="chartreuse3", linewidth=0.15, linetype=1, arrow=arrow(length =unit(0.02, "npc"))) +geom_text(data = TAB_var, aes(x=0.9*RDA1, y=1.1*RDA2, label =row.names(TAB_var)), size =2.5, colour="chartreuse4") +xlab("RDA 1") +ylab("RDA 2") +facet_wrap(~"RDA space") +theme_bw(base_size =11) +theme(panel.background =element_blank(), legend.position ="none",legend.background =element_blank(), panel.grid =element_blank(), plot.background =element_blank(), legend.text=element_text(size=rel(.8)), strip.text =element_text(size=11))# Manhattan plot# ==============p2 <- tab[order(tab$snp_type),] %>%ggplot() +geom_hline(yintercept=-log10(0.01/length(tab$pvalues)), linetype="dashed", color =gray(.80), linewidth=0.6) +geom_point(aes(x=pos, y=-log10(pvalues), col = snp_type), size=1.4) +scale_color_manual(values = legend_colors) +xlab("Loci") +ylab("-log10(p-values)") +facet_wrap(~"Manhattan plot", nrow =3) +theme_bw(base_size =11) +theme(legend.position="right", panel.grid =element_blank(), legend.title=element_blank(),legend.box.background =element_blank(), plot.background =element_blank(), panel.background =element_blank(), legend.text=element_text(size=rel(.8)), strip.text =element_text(size=11))# Merge the two plots# ===================p2_legend <-get_legend(p2)p2 <- p2 +theme(legend.position ="none")plot_row <-plot_grid(p1,p2, ncol=2)plot_row <-plot_grid(plot_row, p2_legend, ncol=2,rel_widths =c(1, 0.1))# Save the plots as pdf# =====================if(grepl("Condition",rda_out$rda_model$form_rda)==TRUE){rda_name <-"pRDA"} else { rda_name <-"RDA"}ggsave(plot_row, device ="pdf", filename =here(paste0("figs/ScotsPine/RDA/RDA_outlier_plots_",rda_name,".pdf")), width=12, height=5)}# Run the function for the RDA models fitted on the set 2 of climatic variableslapply(rda_outliers, function(x) generate_SI_outlier_plots(rda_out = x))
Warning in get_plot_component(plot, "guide-box"): Multiple components found; returning the first one. To return all, use `return_all = TRUE`.
Warning in get_plot_component(plot, "guide-box"): Multiple components found; returning the first one. To return all, use `return_all = TRUE`.
2.0.4 Common outliers btw RDA and pRDA
The Venn diagram below shows the common outliers among those identified with RDA and pRDA (considering all outliers, i.e. also those identified based on their loadings along the significant axes)
The Venn diagram below shows the common outliers between those identified with RDA and pRDA (considering only outliers identified with the Mahalanobis distances along the first two axes, i.e. those that will be used for genomic offset predictions).
climdf <-readRDS(here("data/ScotsPine/subdf.rds")) %>% dplyr::filter(rcp_scenario == rcp_scenario_i & ensemble_member =="01") %>% dplyr::select(-clim_var, -time_averages) %>%pivot_wider(names_from=clim_var_avg, values_from =value)clim_ref <- climdf %>% dplyr::filter(timeslice =="1980-2000")clim_pred <- climdf %>% dplyr::filter(timeslice == timeslice_pred)# Scaling the datasetsscale_params <-lapply(clim_var_avg_selected, function(x){ vec_var <- clim_ref[,x] %>%pull()list(mean =mean(vec_var),sd =sd(vec_var)) }) %>%setNames(clim_var_avg_selected)# Scale climatic data across the reference periodclim_ref <- clim_ref %>% dplyr::mutate(across(any_of(clim_var_avg_selected), ~ (. -mean(.)) /sd(.)))# Scale climatic data across the future periodfor(i in clim_var_avg_selected){ clim_pred[,i] <- (clim_pred[,i] - scale_params[[i]]$mean) / scale_params[[i]]$sd}
3.2 RDA models
Code
runRDA <-function(snp_set, clim_var, clim_df, geno){# Keep the genomic data of the selected SNPsgeno <- geno[,snp_set]# RDA formula form_rda <-paste("geno ~ ", paste(clim_var,collapse=" + "))# run RDA with only the selected SNPsrda_outliers <-rda(as.formula(form_rda), clim_df)}
# Function to calculate the RDA genetic offset for spatial points# ===============================================================# clim_ref = climatic data used to fit the RDA model (the climate-of-origin of the populations)# clim_pred = climatic data used for predictions (so either the future climate of the populations, # or climate of the common gardens or the NFI plots)# weights = if NULL, the adaptive index is not weighted by the relative importance of the RDA axes in # explaining the variance.# K = number of RDA axes used to calculate the genomic offset# snp_set = list with at least the RDA models pred_GO_spatial_points <-function(snp_set, K, clim_ref, clim_pred, clim_var,weights =NULL, CG=F){# weights based on the variance explained by the different axesweights <- (snp_set$rda_model$CCA$eig/sum(snp_set$rda_model$CCA$eig))[1:K]list_AI <-lapply(list(clim_ref,clim_pred), function(df){lapply(1:K, function(i){# Below we calculate the adaptive index for the axis i# For that, we multiply the value of the variables at the location of the population# by the loadings of the variables along the axis iai <- df %>% dplyr::select(any_of(clim_var)) %>%apply(1, function(x) sum(x * snp_set$rda_model$CCA$biplot[,i]))if(!is.null(weights)){ai <- ai * weights[i]} }) %>%setNames(paste0("RDA", 1:length(.))) %>%as.data.frame()})if(CG==F){eucloffset <-unlist(lapply(1:nrow(list_AI[[1]]), function(x) dist(rbind(list_AI[[1]][x,], list_AI[[2]][x,]), method ="euclidean")))} elseif(CG==T){eucloffset <-lapply(1:nrow(list_AI[[2]]), function(x) as.matrix(dist(rbind(list_AI[[2]][x,],list_AI[[1]]), method ="euclidean"))[2:(nrow(list_AI[[1]])+1),1]) %>%setNames(clim_pred[,1] %>%pull()) %>%as.data.frame() %>%cbind(clim_ref[,1]) }return(eucloffset)}
We plot the genomic offset predictions at the location of the 21 Scots pine populations.
Code
tibble(PopulationCode = clim_ref$PopulationCode,GO = rda_list$RDA$GO) %>%make_spatialpoints_map(var="GO",ggtitle="Genomic offset predictions without correction for population structure")
Code
tibble(PopulationCode = clim_ref$PopulationCode,GO = rda_list$pRDA$GO) %>%make_spatialpoints_map(var="GO",ggtitle="Genomic offset predictions with correction for population structure")
Correlation between genomic offset with or without correction for population structure.
Code
cor(rda_list$RDA$GO,rda_list$pRDA$GO)
[1] 0.8537676
References
Capblancq, Thibaut, and Brenna R. Forester. 2021. “Redundancy Analysis: A Swiss Army Knife for Landscape Genomics.”Methods in Ecology and Evolution 12 (12): 2298–2309. https://doi.org/10.1111/2041-210X.13722.
Capblancq, Thibaut, Keurcien Luu, Michael G. B. Blum, and Eric Bazin. 2018. “Evaluation of Redundancy Analysis to Identify Signatures of Local Adaptation.”Molecular Ecology Resources 18 (6): 1223–33. https://doi.org/10.1111/1755-0998.12906.
Capblancq, Thibaut, Xavier Morin, Maya Gueguen, Julien Renaud, Stéphane Lobreaux, and Eric Bazin. 2020. “Climate-Associated Genetic Variation in Fagus Sylvatica and Potential Responses to Climate Change in the French Alps.”Journal of Evolutionary Biology 33 (6): 783–96. https://doi.org/10.1111/jeb.13610.
Forester, Brenna R., Jesse R. Lasky, Helene H. Wagner, and Dean L. Urban. 2018. “Comparing Methods for Detecting Multilocus Adaptation with Multivariate Genotype–Environment Associations.”Molecular Ecology 27 (9): 2215–33. https://doi.org/10.1111/mec.14584.
François, Olivier, Helena Martins, Kevin Caye, and Sean D. Schoville. 2016. “Controlling False Discoveries in Genome Scans for Selection.”Molecular Ecology 25 (2): 454–69. https://doi.org/10.1111/mec.13513.