Scots pine genomic data

Formatting, filtering, imputation of missing data, genetic relationships

Author

Juliette Archambeau

Published

Invalid Date

1 Raw genomic data

1.1 Loading

Here are the 10 first rows and columns of the original genomic dataset.

Code
# Load the raw genomic data:
raw_data <- read_table(here("data/ScotsPine/GenomicData/data_update_Fev2025/raw_data/PineTrial_genotypes.txt"),show_col_types=F) %>%
  dplyr::select(probeset_id,sort(colnames(.)))

# viz
raw_data[1:10,1:10] %>% kable_mydf(boldfirstcolumn = T, font_size = 11)
probeset_id 5001 5002 5004 5005 5006 5007 5008 5009 5010
AX-117395395 AG GG AA GG AG AG AG GG GG
AX-117395404 GG AG AG AG AG AG AA AA AA
AX-117395409 GG GG GG GG TG TG GG GG GG
AX-117395416 AG GG GG GG GG GG GG GG GG
AX-117395425 TT TG TG TG GG GG TG GG GG
AX-117395432 AA AA NN AA AA AA AA AA AA
AX-117395439 CC AC CC AC AC AC AC AC CC
AX-117395441 CG CG CG GG GG CC GG GG CG
AX-117395444 GG AG GG GG GG AG GG GG AG
AX-117395445 GG GG CG CG CG CG CG CC GG

The original genomic dataset includes 1761 individuals (columns) and 36433 SNPs (rows).

Code
# We apply a function to each column (i.e SNP) of the raw genomic data, 
# after removing the first two columns, i.e. the clone ID and the assay in which the clone was genotyped

data <- raw_data %>% column_to_rownames(var="probeset_id") %>% t() %>% as.data.frame()

list_alleles <- lapply(colnames(data), function(x){ 
  
  vec <- data[,x]
  
  tab <- table(vec) %>%  as.matrix() %>%  t() %>% as.data.frame() %>% dplyr::select(-contains("NN"))
  
  
  if(length(colnames(tab))==1){         # monomorphic case, e.g. only A/A
    major <- names(which.max(tab))
    
    case_when(vec == major ~ 0) 
    
  } else if(length(colnames(tab))==2){  # case where there is no homozygote with the rare allele, e.g. A/A and A/G
    
    major <- names(which.max(tab))
    mid <- names(which.min(tab))
    
    case_when(vec == major ~ 0,
              vec == mid   ~ 1) 
    
  } else if (length(colnames(tab))==3){ # common case, e.g. A/A, A/G and G/G
    
    if(tab[,1]==tab[,3]){               # to account for cases where there is the same number of A/A and G/G 
      major <- colnames(tab)[1]
      minor <- colnames(tab)[3]
      } else {
    major <- names(which.max(tab[,c(1,3)]))
    minor <- names(which.min(tab[,c(1,3)]))
      }
    mid <- setdiff(names(tab),c(major,minor))  
    
    case_when(vec == major ~ 0,
              vec == minor ~ 2,
              vec == mid   ~ 1) 
  }
}) %>% setNames(colnames(data)) # re-attribute the SNP ID to list names

df <- list_alleles %>% 
  as_tibble() %>% 
  t() %>% 
  set_colnames(rownames(data)) %>% # attribute individual ID
  as.data.frame()

# save the dataset
df %>% saveRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/unfiltered_allele_counts.rds"))

Here are the 10 first rows and columns of the formatted dataset.

Code
rm(raw_data)
df <- readRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/unfiltered_allele_counts.rds"))

# viz
df[1:10,1:10] %>% kable_mydf(boldfirstcolumn = T, font_size = 11)
5001 5002 5004 5005 5006 5007 5008 5009 5010 5011
AX-117395395 1 0 2 0 1 1 1 0 0 1
AX-117395404 2 1 1 1 1 1 0 0 0 1
AX-117395409 0 0 0 0 1 1 0 0 0 0
AX-117395416 1 0 0 0 0 0 0 0 0 0
AX-117395425 2 1 1 1 0 0 1 0 0 1
AX-117395432 0 0 NA 0 0 0 0 0 0 0
AX-117395439 0 1 0 1 1 1 1 1 0 2
AX-117395441 1 1 1 0 0 2 0 0 1 0
AX-117395444 0 1 0 0 0 1 0 0 1 1
AX-117395445 0 0 1 1 1 1 1 2 0 1

2 Filtering genomic data

2.1 MAC and monomorphic SNPs

Code
# function to calculate the minor allele counts
sum_minor_allele <- function(x){df %>% 
  t() %>% 
  as_tibble() %>% 
  sapply(function(x) sum(x,na.rm=T)) %>% 
  as.data.frame() %>%
  set_colnames(c("sum")) %>% 
  rownames_to_column(var = "snp") %>% 
  as_tibble() 
}
Code
mono <- df %>% sum_minor_allele %>% filter(sum==0)
low_mac <- df %>% sum_minor_allele %>% filter(sum %in% c(1,2))

53 SNPs are monomorphic. We remove them.

143 SNPs have one or two minor allele count (MAC). We also remove these SNPs as the minor alleles may be due to PCR or sequencing errors.

Code
df <- df[!(row.names(df) %in% mono$snp),]
df <- df[!(row.names(df) %in% low_mac$snp),]

After filtering monomorphic SNPs and MAC, 36237 SNPs left.

2.2 NAs and MAF

2.2.1 Missing data per clone

Code
tab <- df %>% 
  as_tibble() %>% 
  sapply(function(x) sum(is.na(x))*100/nrow(df)) %>% 
  as.data.frame() %>%
  set_colnames(c("na_freq")) %>% 
  rownames_to_column(var = "clone") %>% 
  as_tibble() %>% 
  arrange(na_freq) %>% 
  mutate(id=1:nrow(.))

plot(tab$id,tab$na_freq,
     pch=20,
     xlab="Individual number",
     ylab="Percentage of missing data per individual")

All individuals have less than 4.33% missing data. Therefore, we do not apply a filter of missing data per individual.

2.2.2 Minor allele frequencies

We want to remove minor allele frequencies (MAF) for gene-environment association analyses, but not for population genetic structure estimation. Therefore, we produce two datasets: one with MAF and the other without.

Code
tab <- df %>% 
  t() %>% 
  as_tibble() %>% 
  sapply(function(x) sum(x,na.rm=T)*100/(2*nrow(.))) %>% 
  as.data.frame() %>%
  set_colnames(c("MAF")) %>% 
  rownames_to_column(var = "snp") %>% 
  as_tibble() %>% 
  arrange(MAF) %>% 
  mutate(id=1:nrow(.))

plot(tab$id,tab$MAF,
     pch=20,
     xlab="SNP number",
     ylab="Minor allele frequencies")

552 SNPs have MAF < 1%. 3935 SNPs have MAF < 5%.

Code
# We export the two datasets in a list.
maf_threshold <- 5
list_geno <- list(list_withmaf=list(df = df),
                  list_withoutmaf=list(df = df[!(row.names(df) %in% tab[tab$MAF<maf_threshold,]$snp),]))

In the dataset without MAF, we remove the SNPs with MAF < 5%: there are 32302 SNPs left.

2.2.3 Missing data per SNP

Code
identify_NA_perSNP <- function(x){
  
  x$df %>% 
    t() %>% 
    as_tibble() %>% 
    sapply(function(x) sum(is.na(x))*100/nrow(.)) %>% 
    as.data.frame() %>%
    set_colnames(c("na_freq")) %>% 
    rownames_to_column(var = "snp") %>% 
    as_tibble() %>% 
    arrange(na_freq) %>% 
    mutate(id=1:nrow(.))
}

NAfreq_per_SNP <- lapply(list_geno,identify_NA_perSNP)

In the dataset without MAF, the maximum percentage of missing data per SNP is 3.86%.

In the dataset with MAF, the maximum percentage of missing data per SNP is 3.86%.

Therefore, we do not apply a filter for the percentage of missing data per SNP and we keep all the SNPs.

Code
plot(NAfreq_per_SNP$list_withoutmaf$id,NAfreq_per_SNP$list_withoutmaf$na_freq,
     pch=20,
     xlab="SNP number",
     ylab="Percentage of missing data per SNP",
     main="Genomic dataset without MAF")

Code
plot(NAfreq_per_SNP$list_withmaf$id,NAfreq_per_SNP$list_withmaf$na_freq,
     pch=20,
     xlab="SNP number",
     ylab="Percentage of missing data per SNP",
     main="Genomic dataset with MAF")

After the filtering steps:

  • 36237 SNPs and 1761 individuals in the dataset not filtered for MAF.

  • 32302 SNPs and 1761 individuals in the dataset filtered for MAF.

2.3 Exporting the filtered datasets

When exporting the datasets, we add the population and family information.

In February 2025, the family assignment of some individuals were modified. We load the original family and populations assignments and update it with the new assignments.

Code
# Original family and population assignement
field_data <- read_excel(here("data/ScotsPine/Field.xlsx"), na = "NA") %>% 
  dplyr::select(PopulationCode,Family, FieldCode) 

# Updated family assignement
fam_rea <- read_table(here("data/ScotsPine/GenomicData/data_update_Fev2025/raw_data/FamilyReassignment.txt"),show_col_types = FALSE)

# With the code below, we see that the "family" column of "Field.xlsx" and the "OriginalFamily" column of "FamilyReassignment.txt" are the same, which is necessary.
# field_data %>% 
#   left_join(fam_rea, by="FieldCode") %>%  
#   filter(OriginalFamily != AssignedFamily | OriginalFamily != Family | AssignedFamily != Family) 

# We are going to use the "AssignedFamily" column (as advised by Annika Perry)
field_data <- field_data %>% 
  left_join(fam_rea, by="FieldCode") %>% 
  dplyr::select(-Family, -OriginalFamily) %>% 
  dplyr::rename(Family = AssignedFamily) %>% 
  mutate(FieldCode = as.character(FieldCode))

We format the two datasets with SNPs in columns and individuals in rows. And we add two columns with information about the family and the population of each individual.

We show below the first 10 rows and columns of the dataset not filtered for MAF.

Code
list_geno <- lapply(list_geno, function(x){
  
  # we invert the dataset with tidyverse (quite long)
  df <- x$df %>% 
  rownames_to_column("SNP") %>%
  pivot_longer(-SNP, names_to = "FieldCode", values_to = "Value") %>%
  pivot_wider(names_from = SNP, values_from = Value)

  # we merge the two datasets
  field_data %>% right_join(df, by="FieldCode")
  
})

list_geno$list_withmaf[1:10,1:10] %>% kable_mydf()
PopulationCode FieldCode Family AX-117395395 AX-117395404 AX-117395409 AX-117395416 AX-117395425 AX-117395432 AX-117395439
GC 5001 GC5 1 2 0 1 2 0 0
GE 5002 GE9 0 1 0 0 1 0 1
RM 5004 RM7 2 1 0 0 1 NA 0
SO 5005 SO2 0 1 0 0 1 0 1
MG 5006 MG10 1 1 1 0 0 0 1
BB 5007 BB8 1 1 1 0 0 0 1
CR 5008 CR5 1 0 0 0 1 0 1
GT 5009 GT8 0 0 0 0 0 0 1
CC 5010 CC7 0 0 0 0 0 0 0
SD 5011 SD7 1 1 0 0 1 0 2

2.3.1 Allele counts

We export allele counts.

Code
# we save the dataset not filtered for MAF
saveRDS(list_geno$list_withmaf, here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_counts_withMAF.rds"))

# we save the dataset without MAF
saveRDS(list_geno$list_withoutmaf, here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_counts_withoutMAF.rds"))

2.3.2 Allele frequencies

We calculate the population allele frequencies.

Code
# Function to calculate the population allele frequencies
compute_allele_freq <- function(x){
  x %>% 
    dplyr::select(PopulationCode, contains("AX")) %>% 
    group_by(PopulationCode) %>% 
    summarise_all(~sum(., na.rm = TRUE)/((n()-sum(is.na(.)))*2))
}
Code
# For the dataset with MAF
list_geno$list_withmaf %>% 
  compute_allele_freq %>% 
  saveRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_freq_withMAF.rds"))

# For the dataset without MAF
list_geno$list_withoutmaf %>% 
  compute_allele_freq %>% 
  saveRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_freq_withoutMAF.rds"))

3 Imputation of missing data

We impute missing data based on the most common genotype within the family.

We first impute missing data in the dataset not filtered for MAF.

Code
# we load filtered genomic data not filtered for MAF
df <- readRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_counts_withMAF.rds"))

# Function to calculate mode (i.e., most common value)
Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

# Replace NA values with the mode within each family for each SNP
impdf <- df %>%
  group_by(Family) %>%
  mutate(across(starts_with("AX"), ~replace(., is.na(.), Mode(.[!is.na(.)])))) %>% 
  ungroup()

But, there are 8 NAs left. Here are the two families and populations they belong to.

Code
impdf %>%  
  dplyr::select(PopulationCode, FieldCode, Family, where(~ any(is.na(.)))) %>% # Select columns with NA values
  filter(if_any(everything(), is.na)) %>% # selects rows with NA values
  kable_mydf
PopulationCode FieldCode Family AX-388239058 AX-388336314
CR 7008 CR8 0 NA
CR 7047 CR8 0 NA
GT 7129 GT4 NA 0
GT 7182 GT4 NA 0
CR 7441 CR8 0 NA
GT 7442 GT4 NA 0
CR 7472 CR8 0 NA
GT 7496 GT4 NA 0

We look at the counts of each allele in the two families with NAs remaining after imputation.

Code
impdf %>%  dplyr::select(PopulationCode, FieldCode, Family, where(~ any(is.na(.)))) %>% # Selects columns where there are NA values
  filter(PopulationCode %in% c("CR", "GT")) %>%
  pivot_longer(cols = starts_with("AX"), names_to = "SNP", values_to = "allele_value") %>% 
  group_by(PopulationCode, allele_value) %>% 
  summarize(allele_count = n(), .groups = "drop") %>% 
  filter(allele_value %in% c(0, 1, 2)) %>%  # Include only counts for 0, 1, and 2
  kable_mydf()
PopulationCode allele_value allele_count
CR 0 132
CR 1 30
CR 2 6
GT 0 129
GT 1 27
GT 2 4

Based on this, we attribute the value of 0 to the allele AX-388239058 in family GT4 and to the allele AX-388336314 in family CR8.

Code
impdf <- impdf %>%
  mutate(
    `AX-388239058` = case_when(
      Family == "GT4" ~ 0,  # Assign 0 when Family is GT4
      TRUE ~ `AX-388239058` # Retain original value otherwise
    ),
    `AX-388336314` = case_when(
      Family == "CR8" ~ 0,  # Assign 0 when Family is CR8
      TRUE ~ `AX-388336314` # Retain original value otherwise
    )
  )

# We save the dataset
saveRDS(impdf, here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/imputed_allele_counts_withMAF.rds"))

We then subset the dataset by keeping only SNPs without MAF (to get an imputed dataset without MAF).

Code
impdf <- readRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/imputed_allele_counts_withMAF.rds"))

# Extract SNPs filtered for MAF
snps_dfnomaf <- readRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_counts_withoutMAF.rds")) %>% 
  dplyr::select(-PopulationCode,-Family, -FieldCode) %>% 
  colnames()

# Save dataset filtered for MAF and with imputed missing data
impdf %>% 
  dplyr::select(PopulationCode, Family, FieldCode, any_of(snps_dfnomaf)) %>% 
  saveRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/imputed_allele_counts_withoutMAF.rds"))

4 Genetic relatedness coefficients

We use the genomic data not filtered for MAF and with no imputation of missing data to estimate the genetic relatedness coefficients \(r_{ij}\). Assuming no inbreeding, \(r_{ij}\) can be interpreted as the proportion of the genome that is shared between individuals \(i\) and \(j\).

Code
# we load genomic data not filtered for MAF and without imputation for missing data
df <- readRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_counts_withMAF.rds"))

# change the dataset to a matrix
mat <- df %>% 
  dplyr::select(-Family, -PopulationCode) %>% 
  column_to_rownames("FieldCode") %>% 
  as.matrix()

# We estimate the GRM with the VanRaden method
Gmat_VanRaden <- Gmatrix(SNPmatrix=mat, 
                         missingValue=NA, 
                         method="VanRaden")

# We save the G matrix
saveRDS(Gmat_VanRaden, here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/Gmat_VanRaden.rds"))

We extract the genetic relatedness coefficients for each pair of individuals.

In the tutorial GWAS 5: Relatedness and population structure by Matti Pirine (University of Helsinki), the authors refer to cutting points to assign pairs of individuals to a certain degree of relatedness based on their \(r_{ij}\). These thresholds are “a commonly-used inference procedure of the KING software” and are based on the powers of \(1/2\). We will use these thresholds to assign the pairs of individuals to a given degree of relatedness (i.e., identical, 1st degree, 2nd degree, 3rd degree or unrelated).

Code
# we load the GRM
Gmat_VanRaden <- readRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/Gmat_VanRaden.rds"))

# viz the matrix
Gmat_VanRaden[1:10,1:10] %>% 
  as.data.frame() %>% 
  kable_mydf(boldfirstcolumn = TRUE, round_number = 4)
5001 5002 5004 5005 5006 5007 5008 5009 5010 5011
5001 1.0018 0.0065 0.0060 -0.0033 -0.0130 -0.0103 -0.0055 -0.0099 0.0148 0.0064
5002 0.0065 1.0056 0.0299 0.0097 -0.0249 -0.0018 -0.0072 -0.0092 -0.0014 0.0037
5004 0.0060 0.0299 1.0142 0.0001 -0.0099 0.0018 -0.0013 -0.0058 -0.0153 -0.0124
5005 -0.0033 0.0097 0.0001 1.0031 -0.0132 0.0050 0.0085 -0.0074 -0.0224 0.0124
5006 -0.0130 -0.0249 -0.0099 -0.0132 1.0103 0.0020 -0.0168 -0.0029 -0.0055 0.0018
5007 -0.0103 -0.0018 0.0018 0.0050 0.0020 1.0094 -0.0132 0.0004 -0.0045 0.0229
5008 -0.0055 -0.0072 -0.0013 0.0085 -0.0168 -0.0132 0.9954 0.0027 0.0026 -0.0086
5009 -0.0099 -0.0092 -0.0058 -0.0074 -0.0029 0.0004 0.0027 1.0081 -0.0155 -0.0230
5010 0.0148 -0.0014 -0.0153 -0.0224 -0.0055 -0.0045 0.0026 -0.0155 1.0593 -0.0063
5011 0.0064 0.0037 -0.0124 0.0124 0.0018 0.0229 -0.0086 -0.0230 -0.0063 1.0340
Code
# Function to extract the genetic relatedness coefficients in a vector
extract_relatedness_coeffs_from_GRM <- function(GRM){
# Extract upper triangle of the matrix
upper_triangle <- upper.tri(GRM, diag=TRUE)

# Set lower triangle elements to NA
GRM[!upper_triangle] <- NA

# Convert the resulting matrix to a dataframe
df <- as.data.frame(as.table(GRM))

# Remove rows with NA values (optional if you want to remove NA values)
df <- na.omit(df)

# Rename the columns
names(df) <- c("Ind1", "Ind2", "r")

df
}


# Run the function on our GRM
df_r <- extract_relatedness_coeffs_from_GRM(GRM=Gmat_VanRaden) %>% arrange(r)

# Define the thresholds to separate the degrees of relatedness
KING_thresholds <- c(0.088, 0.177, 0.354, 0.707)

# We count the number of individual pairs within each degree of relatedness 
r_counts <- df_r %>%
  filter(!Ind1==Ind2) %>% # we remove the genetic relatedness coeffs calculated by comparing the same individuals
  summarize(
    identical = sum(r > KING_thresholds[[4]]),
    first_degree = sum(r <= KING_thresholds[[4]] & r > KING_thresholds[[3]]),
    second_degree = sum(r <= KING_thresholds[[3]] & r > KING_thresholds[[2]]),
    third_degree = sum(r <= KING_thresholds[[2]] & r > KING_thresholds[[1]]),
    unrelated = sum(r <= KING_thresholds[[1]])
  )

# we prepare the dataset for the plot 
dfplot <- df_r %>% 
  filter(!Ind1==Ind2) %>% # 
  filter(r>0.05) %>% # to aid visualization and speed up plotting (it was long if I keep all the pairs)
  mutate(Index=1:nrow(.))

# Define the breaks and labels for the y-scale
y_breaks <- c(0, 0.25, 0.5, 0.75, 1, KING_thresholds)
y_labels_colors <- ifelse(y_breaks %in% KING_thresholds, "orange", "black")

# Plot
ggplot(dfplot, aes(x = Index, 
                   y = r, 
                   color = cut(r, breaks = c(-Inf, KING_thresholds, Inf),
                               labels = c(paste0("Unrelated (n = ",r_counts$unrelated,")"), 
                                          paste0("3rd degree (n = ",r_counts$third_degree,")"), 
                                          paste0("2nd degree (n = ",r_counts$second_degree,")"), 
                                          paste0("1st degree (n = ",r_counts$first_degree,")"), 
                                          paste0("Identical (n = ",r_counts$identical,")"))))) +
  geom_point() +
  geom_hline(yintercept = KING_thresholds, linetype = "dashed", color = "orange") +
  labs(x = "Pairs of individuals", y = "Genetic relatedness coefficient") +
  theme_bw() +
  scale_color_manual(values = c("blue", "green", "deeppink", "purple", "yellow")) +
   scale_y_continuous(breaks = y_breaks) +
  theme(axis.text.y = element_text(angle = 0, hjust = 1, colour = y_labels_colors),
        legend.position = c(0.2,0.85),
        legend.title = element_blank())

In our dataset, we expect trees to be unrelated and that trees from the same family are half-siblings (with \(0.177 > r_{ij} > 0.354\)).

We look at the field data of the trees with higher \(r_{ij}\) than expected.

Code
# We load data with field and nursery information (that we merge with the updated family assignement)
field_data <- readRDS(file=here("data/ScotsPine/formatted_CG_data.rds")) %>% 
  dplyr::select(PopulationCode, FieldCode, FieldSite, Block, Nursery) %>% 
  mutate(FieldCode = as.character(FieldCode)) %>% 
  right_join(field_data, by=c("FieldCode", "PopulationCode"))
Code
# function to merge field data of the two individuals of each pair
show_fielddata <- function(r_threshold_min,r_threshold_max){
  
  subdf <-  dfplot %>% 
    filter(r>r_threshold_min) %>% 
    filter(r<r_threshold_max)
    
  lapply(c("Ind1","Ind2"), function(x) {
    subdf %>% 
      dplyr::select(any_of(x)) %>% 
      dplyr::rename(FieldCode=any_of(x)) %>% 
      left_join(field_data, by ="FieldCode") %>% 
      set_colnames(paste0(colnames(.),"_",x))
}) %>% 
    bind_cols() %>% 
    mutate(RelatednessCoeff = subdf$r)
}

We first look at pairs of individuals considered to be identical (=clones), i.e. with \(r_{ij}>0.707\).

Code
show_fielddata(r_threshold_min = KING_thresholds[[4]],
               r_threshold_max = 1.1) %>% 
    kable_mydf
FieldCode_Ind1 PopulationCode_Ind1 FieldSite_Ind1 Block_Ind1 Nursery_Ind1 Family_Ind1 FieldCode_Ind2 PopulationCode_Ind2 FieldSite_Ind2 Block_Ind2 Nursery_Ind2 Family_Ind2 RelatednessCoeff
5423 GE FE FE_A NE GE7 7575 GE FS FS_D NG GE7 0.74
5489 GT FE FE_B NE GT7 6128 GT FW FW_A NG GT7 0.78
5333 GE FE FE_B NE GE8 6061 GE FW FW_A NW GE8 0.94
5604 SO FE FE_D NE SO7 6004 SO FW FW_A NE SO7 0.95
7108 AM FS FS_B NG AM9 7632 AM FS FS_C NG AM9 0.96
5283 GA FE FE_D NE GA5 7395 GA FS FS_B NG GA5 0.98
6336 GE FW FW_B NG GE7 7575 GE FS FS_D NG GE7 0.98
5270 GA FE FE_B NE GA10 6086 GA FW FW_A NW GA10 0.99
6157 CC FW FW_A NG CC9 7281 CC FS FS_D NG CC9 1.00
6369 GC FW FW_C NE GC5 7457 GC FS FS_B NG GC5 1.02
6102 SO FW FW_A NW SO10 7419 SO FS FS_A NG SO10 1.04
5542 BB FE FE_D NE BB2 5590 BB FE FE_B NE BB2 1.04
5388 CC FE FE_A NE CC10 6342 CC FW FW_C NW CC10 1.05
5502 GL FE FE_C NE GL3 7502 GL FS FS_C NG GL3 1.06

For the eight pairs of individuals with \(r_{ij}>0.707\) (i.e., individuals considered as identical = clones), individuals come from the same families but not always the same field site and nurseries.

We then look at the 941 pairs of individuals considered to be full-siblings (i.e., with \(0.354>r_{ij}>0.707\)). We do not plot all the pairs but only those for which the two individuals do not belong to the same family.

Code
show_fielddata(r_threshold_min = KING_thresholds[[3]],
               r_threshold_max = KING_thresholds[[4]]) %>% 
  filter(!Family_Ind1 == Family_Ind2) %>% 
  kable_mydf()
FieldCode_Ind1 PopulationCode_Ind1 FieldSite_Ind1 Block_Ind1 Nursery_Ind1 Family_Ind1 FieldCode_Ind2 PopulationCode_Ind2 FieldSite_Ind2 Block_Ind2 Nursery_Ind2 Family_Ind2 RelatednessCoeff
6255 AC FW FW_B NW AC1 6404 AC FW FW_C NG AC2 0.36
5520 CC FE FE_A NE CC3 7304 CC FS FS_B NG CC1 0.36
5072 CG FE FE_A NE CG2 5218 CG FE FE_D NE CG4 0.36
7169 LC FS FS_B NG LC10 7610 LC FS FS_A NG LC1 0.37
6031 LC FW FW_A NG LC1 7169 LC FS FS_B NG LC10 0.37
5645 GE FE FE_A NE GE8 6097 GE FW FW_A NW GE3 0.37
5189 GD FE FE_D NE GD10 7459 GD FS FS_B NG GD1 0.37
5333 GE FE FE_B NE GE8 5383 GE FE FE_D NE GE2 0.41
5092 SO FE FE_D NE SO3 7364 SO FS FS_B NG SO4 0.45
7071 SO FS FS_A NG SO6 7196 SO FS FS_A NG SO5 0.50
5383 GE FE FE_D NE GE2 6061 GE FW FW_A NW GE8 0.52

5 SNPs positions on a reference genome

The files ‘snpInfo420k_PR_010623.txt’ and ‘snpInfo_PR_010623.txt’ contains genetic map locations in the P. tabuliformis genome for the PiSy420k and PiSy50k arrays, respectively. This is from Tanja Pyhajarvi’s group at the University of Helsinki, and mostly the work of Pasi Rastas.

We show the first rows of ‘snpInfo420k_PR_010623.txt’ below.

Code
map_snp_420k <- read_delim(here("data/ScotsPine/GenomicData/SNPpositions/snpInfo420k_PR_010623.txt"), 
                           delim = "\t", 
                           escape_double = FALSE, 
                           trim_ws = TRUE,
                           show_col_types = F)
map_snp_50k <- read_delim(here("data/ScotsPine/GenomicData/SNPpositions/snpInfo_PR_010623.txt"), 
                          delim = "\t", 
                          escape_double = FALSE, 
                          trim_ws = TRUE,
                          show_col_types = F)

# sapply(map_snp_420k, function(x) sum(is.na(x)))

map_snp_420k[1:10,] %>% kable_mydf(boldfirstcolumn = T, font_size = 10)
SNP LG MALE_POS FEMALE_POS TAB_CONTIG TAB_POS MAPQ IMPUTED_LG IMPUTED_MALE_POS IMPUTED_FEMALE_POS IMPUTE_POSITION
AX-117395441 1 103.15 91.14 NA NA NA NA NA NA NA
AX-117395491 1 119.60 103.85 NA NA NA NA NA NA NA
AX-117395528 5 127.00 94.86 NA NA NA NA NA NA NA
AX-117395668 8 126.36 95.60 NA NA NA NA NA NA NA
AX-117395705 9 88.16 71.69 NA NA NA NA NA NA NA
AX-117395848 3 132.25 106.81 NA NA NA NA NA NA NA
AX-117395971 8 62.07 45.53 NA NA NA NA NA NA NA
AX-117395974 1 33.62 38.14 NA NA NA NA NA NA NA
AX-117396039 11 18.74 16.43 NA NA NA NA NA NA NA
AX-117396182 10 134.63 103.92 NA NA NA NA NA NA NA

We review the number of SNPs genotyped in Scots pine and selected for further analyses in the previous sections, which can be found in these two files:

  • 35949 SNPs from the dataset with MAF were found in ‘snpInfo420k_PR_010623.txt’.

  • 32107 SNPs from the dataset without MAF were found in ‘snpInfo420k_PR_010623.txt’.

  • 35794 SNPs from the dataset with MAF were found in ‘snpInfo50k_PR_010623.txt’.

  • 32006 SNPs from the dataset without MAF were found in ‘snpInfo50k_PR_010623.txt’.

288 SNPs (from the dataset with MAF) are not in the file ‘snpInfo420k_PR_010623.txt’, and 443 are not in the file ‘snpInfo50k_PR_010623.txt’.

Below we use the ‘snpInfo420k_PR_010623.txt’ file.

We look at the number of SNPs in each linkage group (LG column) among the SNPs genotyped in Scots pine (the SNPs selected in the previous sections).

Code
lapply(list_geno,function(x){
  
  map_snp_420k %>% 
    filter(SNP %in% colnames(x[,-c(1:3)])) %>% 
    count(LG) 
  
}) %>% 
  setNames(c("Dataset with MAF","Dataset without MAF")) %>% 
  bind_rows(.id="Dataset") %>% 
  pivot_wider(names_from = "Dataset", values_from = "n") %>% 
  kable_mydf()
LG Dataset with MAF Dataset without MAF
1 2947 2704
2 2914 2704
3 2649 2488
4 2568 2407
5 2527 2366
6 2479 2313
7 2433 2277
8 2424 2249
9 2410 2268
10 2374 2197
11 2337 2157
12 2182 1994
NA 5705 3983

For each linkage group, we look at the number of SNPs assigned to each chromosome (TAB_CONTIG column). For most linkage groups, SNPs are mostly assigned to one chromosome, except for the linkage groups 2 and 3 for which the SNPs are almost equally well assigned to chromosome 1 and 3.

Code
lapply(list_geno,function(x){
  
 map_snp_420k %>% 
    filter(SNP %in% colnames(x[,-c(1:3)])) %>% 
    count(LG, TAB_CONTIG)
  
}) %>% 
  setNames(c("Dataset with MAF","Dataset without MAF")) %>% 
  bind_rows(.id="Dataset") %>% 
  pivot_wider(names_from = "Dataset", values_from = "n") %>% 
  kable_mydf()
LG TAB_CONTIG Dataset with MAF Dataset without MAF
1 chr1 2 2
1 chr10 1 1
1 chr11 1 1
1 chr12 3 2
1 chr2 8 6
1 chr3 13 13
1 chr4 2 1
1 chr5 6 4
1 chr6 2746 2523
1 chr7 2 2
1 chr8 3 3
1 tig00047773 1 1
1 tig00048640 1 1
1 tig00266601_1_2 7 5
1 tig00266603 4 4
1 tig00267508 1 1
1 tig00269545 4 3
1 tig00271547 1 1
1 tig00274049 2 1
1 tig00277291 2 1
1 tig00277298_1_2 9 9
1 tig00278606 3 1
1 NA 125 118
2 chr1 1115 1035
2 chr11 2 2
2 chr12 3 2
2 chr3 1612 1503
2 chr4 2 2
2 chr5 1 1
2 chr6 1 1
2 chr7 2 2
2 chr9 3 2
2 tig00014300 5 5
2 tig00039012 1 1
2 tig00047720 1 1
2 tig00082982 7 7
2 tig00266676 1 1
2 tig00267042_1_2 2 2
2 tig00268437 1 1
2 tig00269165_1_2 5 4
2 tig00269400 5 4
2 tig00269939 4 3
2 tig00271673 1 1
2 tig00272993 1 1
2 tig00273573 1 1
2 tig00273990_1_2 2 2
2 tig00274465 3 3
2 tig00277880_1_2 4 4
2 tig00288264 1 NA
2 tig00290873 1 1
2 tig00295179 1 1
2 NA 126 111
3 chr1 1274 1202
3 chr10 2 NA
3 chr11 1 1
3 chr12 1 1
3 chr2 1 1
3 chr3 1192 1119
3 chr4 4 4
3 chr5 4 4
3 chr7 5 5
3 chr8 3 3
3 chr9 3 3
3 tig00007272 1 1
3 tig00021188 1 1
3 tig00042787 3 3
3 tig00058951 1 1
3 tig00266635_1_2 2 2
3 tig00266652 7 7
3 tig00268140 3 3
3 tig00270633 3 3
3 tig00271447_1_2 2 2
3 tig00271564 1 1
3 tig00272805_1_2 6 5
3 tig00274921_1_2 4 4
3 tig00278952 1 1
3 tig00283502 2 2
3 tig00284211 2 2
3 tig00288197 1 1
3 tig00289947 1 1
3 tig00294850 1 1
3 NA 117 104
4 chr1 1 1
4 chr10 5 5
4 chr11 1 1
4 chr12 5 5
4 chr2 2408 2255
4 chr3 4 4
4 chr4 4 4
4 chr5 2 2
4 chr6 1 1
4 chr7 4 4
4 chr8 4 4
4 chr9 3 2
4 tig00006684 2 2
4 tig00039217 3 2
4 tig00042401 2 2
4 tig00044161 2 2
4 tig00077218 1 1
4 tig00077256 1 1
4 tig00266173_1_2 4 4
4 tig00267336 1 1
4 tig00267344 1 1
4 tig00267796 1 1
4 tig00268462 2 2
4 tig00269123 1 1
4 tig00275397 1 1
4 tig00275855 1 1
4 tig00278336 1 1
4 tig00278825 1 1
4 tig00279441 1 1
4 tig00283632 1 1
4 tig00283634 1 1
4 tig00287377 2 2
4 tig00288284 3 3
4 NA 93 87
5 chr11 2 2
5 chr12 2 2
5 chr2 6 5
5 chr3 5 5
5 chr4 5 5
5 chr5 6 4
5 chr6 2 2
5 chr7 2351 2203
5 chr8 3 3
5 chr9 2 2
5 tig00006324 1 1
5 tig00018076 3 3
5 tig00039301 3 3
5 tig00077664 2 2
5 tig00266333 2 2
5 tig00271531_1_2 4 4
5 tig00272769 1 1
5 tig00274292 2 2
5 tig00275497 1 1
5 tig00277599 1 1
5 tig00279854 2 2
5 tig00281158 3 3
5 tig00281464 1 NA
5 tig00281531 1 1
5 tig00285132 2 2
5 tig00285402 2 2
5 NA 112 103
6 chr1 7 6
6 chr10 3 3
6 chr11 3 3
6 chr2 1 1
6 chr3 4 4
6 chr4 3 2
6 chr5 2318 2167
6 chr7 2 2
6 chr8 5 4
6 chr9 1 1
6 tig00030197 2 2
6 tig00034687 3 3
6 tig00035960 3 3
6 tig00038225 1 1
6 tig00267128 1 1
6 tig00271084 3 3
6 tig00271785_1_2 6 5
6 tig00273008 1 1
6 tig00274021 1 NA
6 tig00274023 1 NA
6 tig00274324 1 1
6 tig00278253 1 1
6 tig00281824 1 1
6 tig00284930 1 1
6 tig00290150 1 1
6 tig10501301_1_2 1 NA
6 NA 104 96
7 chr1 1 1
7 chr10 2270 2126
7 chr12 4 4
7 chr2 6 5
7 chr3 4 3
7 chr4 1 1
7 chr5 2 2
7 chr6 1 1
7 chr7 1 1
7 chr8 1 1
7 chr9 4 3
7 tig00090424 4 4
7 tig00270387 1 1
7 tig00270817 11 10
7 tig00272175 1 1
7 tig00273155 3 3
7 tig00273283 5 5
7 tig00273284_1_2 1 1
7 tig00274157_1_2 3 3
7 tig00276392 2 2
7 tig00276869 1 1
7 tig00277386 1 1
7 tig00277752 1 1
7 tig00279281 1 1
7 tig00284464 1 1
7 tig00284732 2 2
7 NA 100 92
8 chr1 8 7
8 chr10 2 2
8 chr11 2 2
8 chr2 3 3
8 chr3 5 5
8 chr4 23 21
8 chr5 3 3
8 chr7 3 2
8 chr8 2 1
8 chr9 2256 2100
8 tig00001967 1 1
8 tig00008465 1 1
8 tig00027363 1 1
8 tig00042343 1 1
8 tig00059690 1 1
8 tig00269493_1_2 1 1
8 tig00270747_1_2 1 NA
8 tig00273090 1 1
8 tig00279148 1 1
8 tig00283104 1 1
8 tig00285956 1 1
8 NA 106 93
9 chr1 2 2
9 chr10 10 10
9 chr12 2 2
9 chr2 11 11
9 chr3 1 1
9 chr4 2 2
9 chr5 4 3
9 chr6 1 1
9 chr7 1 1
9 chr8 2229 2101
9 chr9 2 2
9 tig00042350 1 NA
9 tig00102113 1 1
9 tig00266967_1_2 4 2
9 tig00267961 1 1
9 tig00268080 1 1
9 tig00268161_1_2 1 1
9 tig00268361_1_2 4 4
9 tig00271053_1_2 1 1
9 tig00271183 5 5
9 tig00271604 1 1
9 tig00272358 1 1
9 tig00272805_1_2 5 4
9 tig00272806 3 3
9 tig00272923 3 3
9 tig00273233 1 1
9 tig00274530 1 1
9 tig00275016 3 3
9 tig00276193_1_2 2 2
9 tig00276303 1 1
9 tig00278099 1 1
9 tig00281147_1_2 7 7
9 tig00281636_1_2 5 5
9 tig00284223 2 2
9 tig00294966 1 NA
9 NA 89 81
10 chr1 1 1
10 chr10 2 2
10 chr11 2 2
10 chr12 2 2
10 chr2 2 1
10 chr3 2 2
10 chr4 2210 2046
10 chr5 2 2
10 chr7 1 1
10 chr8 1 1
10 chr9 1 1
10 tig00023909 1 1
10 tig00041218 10 10
10 tig00044581 1 1
10 tig00095463 1 1
10 tig00268633 2 1
10 tig00272284 2 2
10 tig00272524 2 2
10 tig00274145_1_2 2 2
10 tig00275612 1 1
10 tig00278274 2 2
10 tig00280653 1 1
10 tig00281101 1 1
10 tig00287465 2 2
10 NA 120 109
11 chr1 2 1
11 chr10 1 1
11 chr11 2141 1975
11 chr12 1 1
11 chr2 1 1
11 chr3 5 4
11 chr4 12 10
11 chr5 3 3
11 chr6 1 1
11 chr7 27 27
11 chr8 3 3
11 chr9 4 3
11 tig00038982 1 1
11 tig00072456 2 2
11 tig00267405 3 3
11 tig00268171_1_2 2 2
11 tig00269018_1_2 3 3
11 tig00270240 1 1
11 tig00270987 2 2
11 tig00272133_1_2 1 1
11 tig00274581 3 3
11 tig00278385 5 4
11 tig00284127 3 3
11 tig00284295 2 2
11 tig00285408_1_2 10 10
11 tig00286063 1 1
11 tig00287017 1 1
11 NA 96 88
12 chr1 4 4
12 chr10 5 5
12 chr11 1 1
12 chr12 2030 1854
12 chr2 3 3
12 chr3 16 15
12 chr4 1 1
12 chr5 1 1
12 chr7 3 3
12 chr9 1 1
12 tig00267749 2 2
12 tig00267970 2 2
12 tig00269650 1 NA
12 tig00273052 3 3
12 tig00274946 2 2
12 tig00277262 12 8
12 tig00278825 1 1
12 tig00280814 1 1
12 tig00285603 1 1
12 tig00291047 1 1
12 tig00291717 1 1
12 NA 90 84
NA chr1 436 301
NA chr10 427 310
NA chr11 440 334
NA chr12 407 270
NA chr2 401 279
NA chr3 532 361
NA chr4 471 317
NA chr5 454 301
NA chr6 611 434
NA chr7 479 340
NA chr8 487 342
NA chr9 470 320
NA tig00005950 1 1
NA tig00006979 1 1
NA tig00010682 1 1
NA tig00018076 1 NA
NA tig00023909 1 NA
NA tig00034687 1 1
NA tig00037617 1 1
NA tig00038225 2 1
NA tig00038982 1 1
NA tig00041218 1 1
NA tig00042401 1 NA
NA tig00042787 1 1
NA tig00058951 1 1
NA tig00060300 1 NA
NA tig00072456 2 2
NA tig00082982 1 1
NA tig00089845 1 1
NA tig00090424 1 NA
NA tig00095463 1 1
NA tig00266173_1_2 1 1
NA tig00266435 1 1
NA tig00266601_1_2 2 1
NA tig00266757_1_2 1 1
NA tig00267042_1_2 1 1
NA tig00267302 1 1
NA tig00267405 4 3
NA tig00268080 1 1
NA tig00268161_1_2 2 1
NA tig00268361_1_2 2 2
NA tig00269450 1 1
NA tig00269493_1_2 1 1
NA tig00269545 1 1
NA tig00269568_1_2 1 1
NA tig00270817 1 NA
NA tig00271522 1 1
NA tig00271726 1 1
NA tig00272133_1_2 1 1
NA tig00272749 1 1
NA tig00272805_1_2 2 2
NA tig00272806 4 NA
NA tig00272993 1 1
NA tig00273283 1 1
NA tig00273589 1 1
NA tig00274324 1 1
NA tig00274451 1 1
NA tig00274465 1 1
NA tig00274530 1 1
NA tig00274883 1 1
NA tig00276303 1 NA
NA tig00276392 2 2
NA tig00277262 1 NA
NA tig00277599 1 1
NA tig00277624 1 1
NA tig00277880_1_2 1 1
NA tig00278274 1 1
NA tig00278385 1 1
NA tig00278825 1 1
NA tig00281147_1_2 5 5
NA tig00281350 1 1
NA tig00281464 1 1
NA tig00281636_1_2 4 4
NA tig00283418 1 1
NA tig00284211 1 1
NA tig00285408_1_2 1 1
NA tig00285453 1 1
NA tig00287377 1 1
NA tig00288197 2 2
NA tig00288593 1 1
NA tig00291717 1 1

To identify SNP located nearby on the genome, we first identify SNPs on the same chromosome (based on the TAB_CONTIG column) and then determine their proximity using the TAB_POS column.

Below we plot the genome positions of the SNPs without MAF.

Code
# Formatting gen_coord for the figure
gen_coord <- map_snp_420k %>%
  filter(SNP %in% colnames(list_geno$list_withmaf[,-c(1:3)])) %>% 
  filter(TAB_CONTIG %in% paste0("chr", 1:12))
# sapply(gen_coord, function(x) sum(is.na(x))) # no NAs in the TAB_CONTIG and TAB_POS columns


gen_coord %>%
  ggplot() +
  # All SNPs of the reference genome 
  geom_segment(aes(x = TAB_POS, xend = TAB_POS, y = 0, yend = 0.2), col = "blue", size = 0.06, alpha=0.3) + 
  ylim(0,0.5) +
  facet_wrap(~TAB_CONTIG, scales = "free_x", ncol=3) +
  xlab("SNP positions on the reference genome") +
  ylab("") +
  theme_bw(base_size = 11) +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "right",
    legend.background = element_blank(),
    panel.grid = element_blank(),
    legend.box.background = element_blank(),
    plot.background = element_blank(),
    panel.background = element_blank(),
    legend.text = element_text(size = rel(0.8)),
    strip.text = element_text(size = 11)
  )