Scots pine genomic data

Formatting, filtering, imputation of missing data, genetic relationships

Author

Juliette Archambeau

Published

Invalid Date

1 Raw genomic data

1.1 Loading

Here are the 10 first rows and columns of the original genomic dataset.

Code

# Load the raw genomic data:
raw_data <- read_table(here("data/ScotsPine/GenomicData/data_update_Fev2025/raw_data/PineTrial_genotypes.txt"),show_col_types=F) %>%
  dplyr::select(probeset_id,sort(colnames(.)))

# viz
raw_data[1:10,1:10] %>% kable_mydf(boldfirstcolumn = T, font_size = 11)

probeset_id	5001	5002	5004	5005	5006	5007	5008	5009	5010
AX-117395395	AG	GG	AA	GG	AG	AG	AG	GG	GG
AX-117395404	GG	AG	AG	AG	AG	AG	AA	AA	AA
AX-117395409	GG	GG	GG	GG	TG	TG	GG	GG	GG
AX-117395416	AG	GG	GG	GG	GG	GG	GG	GG	GG
AX-117395425	TT	TG	TG	TG	GG	GG	TG	GG	GG
AX-117395432	AA	AA	NN	AA	AA	AA	AA	AA	AA
AX-117395439	CC	AC	CC	AC	AC	AC	AC	AC	CC
AX-117395441	CG	CG	CG	GG	GG	CC	GG	GG	CG
AX-117395444	GG	AG	GG	GG	GG	AG	GG	GG	AG
AX-117395445	GG	GG	CG	CG	CG	CG	CG	CC	GG

The original genomic dataset includes 1761 individuals (columns) and 36433 SNPs (rows).

Code

# We apply a function to each column (i.e SNP) of the raw genomic data, 
# after removing the first two columns, i.e. the clone ID and the assay in which the clone was genotyped

data <- raw_data %>% column_to_rownames(var="probeset_id") %>% t() %>% as.data.frame()

list_alleles <- lapply(colnames(data), function(x){ 
  
  vec <- data[,x]
  
  tab <- table(vec) %>%  as.matrix() %>%  t() %>% as.data.frame() %>% dplyr::select(-contains("NN"))
  
  
  if(length(colnames(tab))==1){         # monomorphic case, e.g. only A/A
    major <- names(which.max(tab))
    
    case_when(vec == major ~ 0) 
    
  } else if(length(colnames(tab))==2){  # case where there is no homozygote with the rare allele, e.g. A/A and A/G
    
    major <- names(which.max(tab))
    mid <- names(which.min(tab))
    
    case_when(vec == major ~ 0,
              vec == mid   ~ 1) 
    
  } else if (length(colnames(tab))==3){ # common case, e.g. A/A, A/G and G/G
    
    if(tab[,1]==tab[,3]){               # to account for cases where there is the same number of A/A and G/G 
      major <- colnames(tab)[1]
      minor <- colnames(tab)[3]
      } else {
    major <- names(which.max(tab[,c(1,3)]))
    minor <- names(which.min(tab[,c(1,3)]))
      }
    mid <- setdiff(names(tab),c(major,minor))  
    
    case_when(vec == major ~ 0,
              vec == minor ~ 2,
              vec == mid   ~ 1) 
  }
}) %>% setNames(colnames(data)) # re-attribute the SNP ID to list names

df <- list_alleles %>% 
  as_tibble() %>% 
  t() %>% 
  set_colnames(rownames(data)) %>% # attribute individual ID
  as.data.frame()

# save the dataset
df %>% saveRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/unfiltered_allele_counts.rds"))

Here are the 10 first rows and columns of the formatted dataset.

Code

rm(raw_data)
df <- readRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/unfiltered_allele_counts.rds"))

# viz
df[1:10,1:10] %>% kable_mydf(boldfirstcolumn = T, font_size = 11)

	5001	5002	5004	5005	5006	5007	5008	5009	5010	5011
AX-117395395	1	0	2	0	1	1	1	0	0	1
AX-117395404	2	1	1	1	1	1	0	0	0	1
AX-117395409	0	0	0	0	1	1	0	0	0	0
AX-117395416	1	0	0	0	0	0	0	0	0	0
AX-117395425	2	1	1	1	0	0	1	0	0	1
AX-117395432	0	0	NA	0	0	0	0	0	0	0
AX-117395439	0	1	0	1	1	1	1	1	0	2
AX-117395441	1	1	1	0	0	2	0	0	1	0
AX-117395444	0	1	0	0	0	1	0	0	1	1
AX-117395445	0	0	1	1	1	1	1	2	0	1

2 Filtering genomic data

2.1 MAC and monomorphic SNPs

Code

# function to calculate the minor allele counts
sum_minor_allele <- function(x){df %>% 
  t() %>% 
  as_tibble() %>% 
  sapply(function(x) sum(x,na.rm=T)) %>% 
  as.data.frame() %>%
  set_colnames(c("sum")) %>% 
  rownames_to_column(var = "snp") %>% 
  as_tibble() 
}

Code

mono <- df %>% sum_minor_allele %>% filter(sum==0)
low_mac <- df %>% sum_minor_allele %>% filter(sum %in% c(1,2))

53 SNPs are monomorphic. We remove them.

143 SNPs have one or two minor allele count (MAC). We also remove these SNPs as the minor alleles may be due to PCR or sequencing errors.

Code

df <- df[!(row.names(df) %in% mono$snp),]
df <- df[!(row.names(df) %in% low_mac$snp),]

After filtering monomorphic SNPs and MAC, 36237 SNPs left.

2.2 NAs and MAF

2.2.1 Missing data per clone

Code

tab <- df %>% 
  as_tibble() %>% 
  sapply(function(x) sum(is.na(x))*100/nrow(df)) %>% 
  as.data.frame() %>%
  set_colnames(c("na_freq")) %>% 
  rownames_to_column(var = "clone") %>% 
  as_tibble() %>% 
  arrange(na_freq) %>% 
  mutate(id=1:nrow(.))

plot(tab$id,tab$na_freq,
     pch=20,
     xlab="Individual number",
     ylab="Percentage of missing data per individual")

All individuals have less than 4.33% missing data. Therefore, we do not apply a filter of missing data per individual.

2.2.2 Minor allele frequencies

We want to remove minor allele frequencies (MAF) for gene-environment association analyses, but not for population genetic structure estimation. Therefore, we produce two datasets: one with MAF and the other without.

Code

tab <- df %>% 
  t() %>% 
  as_tibble() %>% 
  sapply(function(x) sum(x,na.rm=T)*100/(2*nrow(.))) %>% 
  as.data.frame() %>%
  set_colnames(c("MAF")) %>% 
  rownames_to_column(var = "snp") %>% 
  as_tibble() %>% 
  arrange(MAF) %>% 
  mutate(id=1:nrow(.))

plot(tab$id,tab$MAF,
     pch=20,
     xlab="SNP number",
     ylab="Minor allele frequencies")

552 SNPs have MAF < 1%. 3935 SNPs have MAF < 5%.

Code

# We export the two datasets in a list.
maf_threshold <- 5
list_geno <- list(list_withmaf=list(df = df),
                  list_withoutmaf=list(df = df[!(row.names(df) %in% tab[tab$MAF<maf_threshold,]$snp),]))

In the dataset without MAF, we remove the SNPs with MAF < 5%: there are 32302 SNPs left.

2.2.3 Missing data per SNP

Code

identify_NA_perSNP <- function(x){
  
  x$df %>% 
    t() %>% 
    as_tibble() %>% 
    sapply(function(x) sum(is.na(x))*100/nrow(.)) %>% 
    as.data.frame() %>%
    set_colnames(c("na_freq")) %>% 
    rownames_to_column(var = "snp") %>% 
    as_tibble() %>% 
    arrange(na_freq) %>% 
    mutate(id=1:nrow(.))
}

NAfreq_per_SNP <- lapply(list_geno,identify_NA_perSNP)

In the dataset without MAF, the maximum percentage of missing data per SNP is 3.86%.

In the dataset with MAF, the maximum percentage of missing data per SNP is 3.86%.

Therefore, we do not apply a filter for the percentage of missing data per SNP and we keep all the SNPs.

Code

plot(NAfreq_per_SNP$list_withoutmaf$id,NAfreq_per_SNP$list_withoutmaf$na_freq,
     pch=20,
     xlab="SNP number",
     ylab="Percentage of missing data per SNP",
     main="Genomic dataset without MAF")

Code

plot(NAfreq_per_SNP$list_withmaf$id,NAfreq_per_SNP$list_withmaf$na_freq,
     pch=20,
     xlab="SNP number",
     ylab="Percentage of missing data per SNP",
     main="Genomic dataset with MAF")

After the filtering steps:

36237 SNPs and 1761 individuals in the dataset not filtered for MAF.
32302 SNPs and 1761 individuals in the dataset filtered for MAF.

2.3 Exporting the filtered datasets

When exporting the datasets, we add the population and family information.

In February 2025, the family assignment of some individuals were modified. We load the original family and populations assignments and update it with the new assignments.

Code

# Original family and population assignement
field_data <- read_excel(here("data/ScotsPine/Field.xlsx"), na = "NA") %>% 
  dplyr::select(PopulationCode,Family, FieldCode) 

# Updated family assignement
fam_rea <- read_table(here("data/ScotsPine/GenomicData/data_update_Fev2025/raw_data/FamilyReassignment.txt"),show_col_types = FALSE)

# With the code below, we see that the "family" column of "Field.xlsx" and the "OriginalFamily" column of "FamilyReassignment.txt" are the same, which is necessary.
# field_data %>% 
#   left_join(fam_rea, by="FieldCode") %>%  
#   filter(OriginalFamily != AssignedFamily | OriginalFamily != Family | AssignedFamily != Family) 

# We are going to use the "AssignedFamily" column (as advised by Annika Perry)
field_data <- field_data %>% 
  left_join(fam_rea, by="FieldCode") %>% 
  dplyr::select(-Family, -OriginalFamily) %>% 
  dplyr::rename(Family = AssignedFamily) %>% 
  mutate(FieldCode = as.character(FieldCode))

We format the two datasets with SNPs in columns and individuals in rows. And we add two columns with information about the family and the population of each individual.

We show below the first 10 rows and columns of the dataset not filtered for MAF.

Code

list_geno <- lapply(list_geno, function(x){
  
  # we invert the dataset with tidyverse (quite long)
  df <- x$df %>% 
  rownames_to_column("SNP") %>%
  pivot_longer(-SNP, names_to = "FieldCode", values_to = "Value") %>%
  pivot_wider(names_from = SNP, values_from = Value)

  # we merge the two datasets
  field_data %>% right_join(df, by="FieldCode")
  
})

list_geno$list_withmaf[1:10,1:10] %>% kable_mydf()

PopulationCode	FieldCode	Family	AX-117395395	AX-117395404	AX-117395409	AX-117395416	AX-117395425	AX-117395432	AX-117395439
GC	5001	GC5	1	2	0	1	2	0	0
GE	5002	GE9	0	1	0	0	1	0	1
RM	5004	RM7	2	1	0	0	1	NA	0
SO	5005	SO2	0	1	0	0	1	0	1
MG	5006	MG10	1	1	1	0	0	0	1
BB	5007	BB8	1	1	1	0	0	0	1
CR	5008	CR5	1	0	0	0	1	0	1
GT	5009	GT8	0	0	0	0	0	0	1
CC	5010	CC7	0	0	0	0	0	0	0
SD	5011	SD7	1	1	0	0	1	0	2

2.3.1 Allele counts

We export allele counts.

Code

# we save the dataset not filtered for MAF
saveRDS(list_geno$list_withmaf, here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_counts_withMAF.rds"))

# we save the dataset without MAF
saveRDS(list_geno$list_withoutmaf, here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_counts_withoutMAF.rds"))

2.3.2 Allele frequencies

We calculate the population allele frequencies.

Code

# Function to calculate the population allele frequencies
compute_allele_freq <- function(x){
  x %>% 
    dplyr::select(PopulationCode, contains("AX")) %>% 
    group_by(PopulationCode) %>% 
    summarise_all(~sum(., na.rm = TRUE)/((n()-sum(is.na(.)))*2))
}

Code

# For the dataset with MAF
list_geno$list_withmaf %>% 
  compute_allele_freq %>% 
  saveRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_freq_withMAF.rds"))

# For the dataset without MAF
list_geno$list_withoutmaf %>% 
  compute_allele_freq %>% 
  saveRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_freq_withoutMAF.rds"))

3 Imputation of missing data

We impute missing data based on the most common genotype within the family.

We first impute missing data in the dataset not filtered for MAF.

Code

# we load filtered genomic data not filtered for MAF
df <- readRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_counts_withMAF.rds"))

# Function to calculate mode (i.e., most common value)
Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

# Replace NA values with the mode within each family for each SNP
impdf <- df %>%
  group_by(Family) %>%
  mutate(across(starts_with("AX"), ~replace(., is.na(.), Mode(.[!is.na(.)])))) %>% 
  ungroup()

But, there are 8 NAs left. Here are the two families and populations they belong to.

Code

impdf %>%  
  dplyr::select(PopulationCode, FieldCode, Family, where(~ any(is.na(.)))) %>% # Select columns with NA values
  filter(if_any(everything(), is.na)) %>% # selects rows with NA values
  kable_mydf

PopulationCode	FieldCode	Family	AX-388239058	AX-388336314
CR	7008	CR8	0	NA
CR	7047	CR8	0	NA
GT	7129	GT4	NA	0
GT	7182	GT4	NA	0
CR	7441	CR8	0	NA
GT	7442	GT4	NA	0
CR	7472	CR8	0	NA
GT	7496	GT4	NA	0

We look at the counts of each allele in the two families with NAs remaining after imputation.

Code

impdf %>%  dplyr::select(PopulationCode, FieldCode, Family, where(~ any(is.na(.)))) %>% # Selects columns where there are NA values
  filter(PopulationCode %in% c("CR", "GT")) %>%
  pivot_longer(cols = starts_with("AX"), names_to = "SNP", values_to = "allele_value") %>% 
  group_by(PopulationCode, allele_value) %>% 
  summarize(allele_count = n(), .groups = "drop") %>% 
  filter(allele_value %in% c(0, 1, 2)) %>%  # Include only counts for 0, 1, and 2
  kable_mydf()

PopulationCode	allele_value	allele_count
CR	0	132
CR	1	30
CR	2	6
GT	0	129
GT	1	27
GT	2	4

Based on this, we attribute the value of 0 to the allele AX-388239058 in family GT4 and to the allele AX-388336314 in family CR8.

Code

impdf <- impdf %>%
  mutate(
    `AX-388239058` = case_when(
      Family == "GT4" ~ 0,  # Assign 0 when Family is GT4
      TRUE ~ `AX-388239058` # Retain original value otherwise
    ),
    `AX-388336314` = case_when(
      Family == "CR8" ~ 0,  # Assign 0 when Family is CR8
      TRUE ~ `AX-388336314` # Retain original value otherwise
    )
  )

# We save the dataset
saveRDS(impdf, here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/imputed_allele_counts_withMAF.rds"))

We then subset the dataset by keeping only SNPs without MAF (to get an imputed dataset without MAF).

Code

impdf <- readRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/imputed_allele_counts_withMAF.rds"))

# Extract SNPs filtered for MAF
snps_dfnomaf <- readRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_counts_withoutMAF.rds")) %>% 
  dplyr::select(-PopulationCode,-Family, -FieldCode) %>% 
  colnames()

# Save dataset filtered for MAF and with imputed missing data
impdf %>% 
  dplyr::select(PopulationCode, Family, FieldCode, any_of(snps_dfnomaf)) %>% 
  saveRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/imputed_allele_counts_withoutMAF.rds"))

4 Genetic relatedness coefficients

We use the genomic data not filtered for MAF and with no imputation of missing data to estimate the genetic relatedness coefficients \(r_{ij}\). Assuming no inbreeding, \(r_{ij}\) can be interpreted as the proportion of the genome that is shared between individuals \(i\) and \(j\).

Code

# we load genomic data not filtered for MAF and without imputation for missing data
df <- readRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_counts_withMAF.rds"))

# change the dataset to a matrix
mat <- df %>% 
  dplyr::select(-Family, -PopulationCode) %>% 
  column_to_rownames("FieldCode") %>% 
  as.matrix()

# We estimate the GRM with the VanRaden method
Gmat_VanRaden <- Gmatrix(SNPmatrix=mat, 
                         missingValue=NA, 
                         method="VanRaden")

# We save the G matrix
saveRDS(Gmat_VanRaden, here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/Gmat_VanRaden.rds"))

We extract the genetic relatedness coefficients for each pair of individuals.

In the tutorial GWAS 5: Relatedness and population structure by Matti Pirine (University of Helsinki), the authors refer to cutting points to assign pairs of individuals to a certain degree of relatedness based on their \(r_{ij}\). These thresholds are “a commonly-used inference procedure of the KING software” and are based on the powers of \(1/2\). We will use these thresholds to assign the pairs of individuals to a given degree of relatedness (i.e., identical, 1st degree, 2nd degree, 3rd degree or unrelated).

Code

# we load the GRM
Gmat_VanRaden <- readRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/Gmat_VanRaden.rds"))

# viz the matrix
Gmat_VanRaden[1:10,1:10] %>% 
  as.data.frame() %>% 
  kable_mydf(boldfirstcolumn = TRUE, round_number = 4)

	5001	5002	5004	5005	5006	5007	5008	5009	5010	5011
5001	1.0018	0.0065	0.0060	-0.0033	-0.0130	-0.0103	-0.0055	-0.0099	0.0148	0.0064
5002	0.0065	1.0056	0.0299	0.0097	-0.0249	-0.0018	-0.0072	-0.0092	-0.0014	0.0037
5004	0.0060	0.0299	1.0142	0.0001	-0.0099	0.0018	-0.0013	-0.0058	-0.0153	-0.0124
5005	-0.0033	0.0097	0.0001	1.0031	-0.0132	0.0050	0.0085	-0.0074	-0.0224	0.0124
5006	-0.0130	-0.0249	-0.0099	-0.0132	1.0103	0.0020	-0.0168	-0.0029	-0.0055	0.0018
5007	-0.0103	-0.0018	0.0018	0.0050	0.0020	1.0094	-0.0132	0.0004	-0.0045	0.0229
5008	-0.0055	-0.0072	-0.0013	0.0085	-0.0168	-0.0132	0.9954	0.0027	0.0026	-0.0086
5009	-0.0099	-0.0092	-0.0058	-0.0074	-0.0029	0.0004	0.0027	1.0081	-0.0155	-0.0230
5010	0.0148	-0.0014	-0.0153	-0.0224	-0.0055	-0.0045	0.0026	-0.0155	1.0593	-0.0063
5011	0.0064	0.0037	-0.0124	0.0124	0.0018	0.0229	-0.0086	-0.0230	-0.0063	1.0340

Code

# Function to extract the genetic relatedness coefficients in a vector
extract_relatedness_coeffs_from_GRM <- function(GRM){
# Extract upper triangle of the matrix
upper_triangle <- upper.tri(GRM, diag=TRUE)

# Set lower triangle elements to NA
GRM[!upper_triangle] <- NA

# Convert the resulting matrix to a dataframe
df <- as.data.frame(as.table(GRM))

# Remove rows with NA values (optional if you want to remove NA values)
df <- na.omit(df)

# Rename the columns
names(df) <- c("Ind1", "Ind2", "r")

df
}


# Run the function on our GRM
df_r <- extract_relatedness_coeffs_from_GRM(GRM=Gmat_VanRaden) %>% arrange(r)

# Define the thresholds to separate the degrees of relatedness
KING_thresholds <- c(0.088, 0.177, 0.354, 0.707)

# We count the number of individual pairs within each degree of relatedness 
r_counts <- df_r %>%
  filter(!Ind1==Ind2) %>% # we remove the genetic relatedness coeffs calculated by comparing the same individuals
  summarize(
    identical = sum(r > KING_thresholds[[4]]),
    first_degree = sum(r <= KING_thresholds[[4]] & r > KING_thresholds[[3]]),
    second_degree = sum(r <= KING_thresholds[[3]] & r > KING_thresholds[[2]]),
    third_degree = sum(r <= KING_thresholds[[2]] & r > KING_thresholds[[1]]),
    unrelated = sum(r <= KING_thresholds[[1]])
  )

# we prepare the dataset for the plot 
dfplot <- df_r %>% 
  filter(!Ind1==Ind2) %>% # 
  filter(r>0.05) %>% # to aid visualization and speed up plotting (it was long if I keep all the pairs)
  mutate(Index=1:nrow(.))

# Define the breaks and labels for the y-scale
y_breaks <- c(0, 0.25, 0.5, 0.75, 1, KING_thresholds)
y_labels_colors <- ifelse(y_breaks %in% KING_thresholds, "orange", "black")

# Plot
ggplot(dfplot, aes(x = Index, 
                   y = r, 
                   color = cut(r, breaks = c(-Inf, KING_thresholds, Inf),
                               labels = c(paste0("Unrelated (n = ",r_counts$unrelated,")"), 
                                          paste0("3rd degree (n = ",r_counts$third_degree,")"), 
                                          paste0("2nd degree (n = ",r_counts$second_degree,")"), 
                                          paste0("1st degree (n = ",r_counts$first_degree,")"), 
                                          paste0("Identical (n = ",r_counts$identical,")"))))) +
  geom_point() +
  geom_hline(yintercept = KING_thresholds, linetype = "dashed", color = "orange") +
  labs(x = "Pairs of individuals", y = "Genetic relatedness coefficient") +
  theme_bw() +
  scale_color_manual(values = c("blue", "green", "deeppink", "purple", "yellow")) +
   scale_y_continuous(breaks = y_breaks) +
  theme(axis.text.y = element_text(angle = 0, hjust = 1, colour = y_labels_colors),
        legend.position = c(0.2,0.85),
        legend.title = element_blank())

In our dataset, we expect trees to be unrelated and that trees from the same family are half-siblings (with \(0.177 > r_{ij} > 0.354\)).

We look at the field data of the trees with higher \(r_{ij}\) than expected.

Code

# We load data with field and nursery information (that we merge with the updated family assignement)
field_data <- readRDS(file=here("data/ScotsPine/formatted_CG_data.rds")) %>% 
  dplyr::select(PopulationCode, FieldCode, FieldSite, Block, Nursery) %>% 
  mutate(FieldCode = as.character(FieldCode)) %>% 
  right_join(field_data, by=c("FieldCode", "PopulationCode"))

Code

# function to merge field data of the two individuals of each pair
show_fielddata <- function(r_threshold_min,r_threshold_max){
  
  subdf <-  dfplot %>% 
    filter(r>r_threshold_min) %>% 
    filter(r<r_threshold_max)
    
  lapply(c("Ind1","Ind2"), function(x) {
    subdf %>% 
      dplyr::select(any_of(x)) %>% 
      dplyr::rename(FieldCode=any_of(x)) %>% 
      left_join(field_data, by ="FieldCode") %>% 
      set_colnames(paste0(colnames(.),"_",x))
}) %>% 
    bind_cols() %>% 
    mutate(RelatednessCoeff = subdf$r)
}

We first look at pairs of individuals considered to be identical (=clones), i.e. with \(r_{ij}>0.707\).

Code

show_fielddata(r_threshold_min = KING_thresholds[[4]],
               r_threshold_max = 1.1) %>% 
    kable_mydf

FieldCode_Ind1	PopulationCode_Ind1	FieldSite_Ind1	Block_Ind1	Nursery_Ind1	Family_Ind1	FieldCode_Ind2	PopulationCode_Ind2	FieldSite_Ind2	Block_Ind2	Nursery_Ind2	Family_Ind2	RelatednessCoeff
5423	GE	FE	FE_A	NE	GE7	7575	GE	FS	FS_D	NG	GE7	0.74
5489	GT	FE	FE_B	NE	GT7	6128	GT	FW	FW_A	NG	GT7	0.78
5333	GE	FE	FE_B	NE	GE8	6061	GE	FW	FW_A	NW	GE8	0.94
5604	SO	FE	FE_D	NE	SO7	6004	SO	FW	FW_A	NE	SO7	0.95
7108	AM	FS	FS_B	NG	AM9	7632	AM	FS	FS_C	NG	AM9	0.96
5283	GA	FE	FE_D	NE	GA5	7395	GA	FS	FS_B	NG	GA5	0.98
6336	GE	FW	FW_B	NG	GE7	7575	GE	FS	FS_D	NG	GE7	0.98
5270	GA	FE	FE_B	NE	GA10	6086	GA	FW	FW_A	NW	GA10	0.99
6157	CC	FW	FW_A	NG	CC9	7281	CC	FS	FS_D	NG	CC9	1.00
6369	GC	FW	FW_C	NE	GC5	7457	GC	FS	FS_B	NG	GC5	1.02
6102	SO	FW	FW_A	NW	SO10	7419	SO	FS	FS_A	NG	SO10	1.04
5542	BB	FE	FE_D	NE	BB2	5590	BB	FE	FE_B	NE	BB2	1.04
5388	CC	FE	FE_A	NE	CC10	6342	CC	FW	FW_C	NW	CC10	1.05
5502	GL	FE	FE_C	NE	GL3	7502	GL	FS	FS_C	NG	GL3	1.06

For the eight pairs of individuals with \(r_{ij}>0.707\) (i.e., individuals considered as identical = clones), individuals come from the same families but not always the same field site and nurseries.

We then look at the 941 pairs of individuals considered to be full-siblings (i.e., with \(0.354>r_{ij}>0.707\)). We do not plot all the pairs but only those for which the two individuals do not belong to the same family.

Code

show_fielddata(r_threshold_min = KING_thresholds[[3]],
               r_threshold_max = KING_thresholds[[4]]) %>% 
  filter(!Family_Ind1 == Family_Ind2) %>% 
  kable_mydf()

FieldCode_Ind1	PopulationCode_Ind1	FieldSite_Ind1	Block_Ind1	Nursery_Ind1	Family_Ind1	FieldCode_Ind2	PopulationCode_Ind2	FieldSite_Ind2	Block_Ind2	Nursery_Ind2	Family_Ind2	RelatednessCoeff
6255	AC	FW	FW_B	NW	AC1	6404	AC	FW	FW_C	NG	AC2	0.36
5520	CC	FE	FE_A	NE	CC3	7304	CC	FS	FS_B	NG	CC1	0.36
5072	CG	FE	FE_A	NE	CG2	5218	CG	FE	FE_D	NE	CG4	0.36
7169	LC	FS	FS_B	NG	LC10	7610	LC	FS	FS_A	NG	LC1	0.37
6031	LC	FW	FW_A	NG	LC1	7169	LC	FS	FS_B	NG	LC10	0.37
5645	GE	FE	FE_A	NE	GE8	6097	GE	FW	FW_A	NW	GE3	0.37
5189	GD	FE	FE_D	NE	GD10	7459	GD	FS	FS_B	NG	GD1	0.37
5333	GE	FE	FE_B	NE	GE8	5383	GE	FE	FE_D	NE	GE2	0.41
5092	SO	FE	FE_D	NE	SO3	7364	SO	FS	FS_B	NG	SO4	0.45
7071	SO	FS	FS_A	NG	SO6	7196	SO	FS	FS_A	NG	SO5	0.50
5383	GE	FE	FE_D	NE	GE2	6061	GE	FW	FW_A	NW	GE8	0.52

5 SNPs positions on a reference genome

The files ‘snpInfo420k_PR_010623.txt’ and ‘snpInfo_PR_010623.txt’ contains genetic map locations in the P. tabuliformis genome for the PiSy420k and PiSy50k arrays, respectively. This is from Tanja Pyhajarvi’s group at the University of Helsinki, and mostly the work of Pasi Rastas.

We show the first rows of ‘snpInfo420k_PR_010623.txt’ below.

Code

map_snp_420k <- read_delim(here("data/ScotsPine/GenomicData/SNPpositions/snpInfo420k_PR_010623.txt"), 
                           delim = "\t", 
                           escape_double = FALSE, 
                           trim_ws = TRUE,
                           show_col_types = F)
map_snp_50k <- read_delim(here("data/ScotsPine/GenomicData/SNPpositions/snpInfo_PR_010623.txt"), 
                          delim = "\t", 
                          escape_double = FALSE, 
                          trim_ws = TRUE,
                          show_col_types = F)

# sapply(map_snp_420k, function(x) sum(is.na(x)))

map_snp_420k[1:10,] %>% kable_mydf(boldfirstcolumn = T, font_size = 10)

SNP	LG	MALE_POS	FEMALE_POS	TAB_CONTIG	TAB_POS	MAPQ	IMPUTED_LG	IMPUTED_MALE_POS	IMPUTED_FEMALE_POS	IMPUTE_POSITION
AX-117395441	1	103.15	91.14	NA	NA	NA	NA	NA	NA	NA
AX-117395491	1	119.60	103.85	NA	NA	NA	NA	NA	NA	NA
AX-117395528	5	127.00	94.86	NA	NA	NA	NA	NA	NA	NA
AX-117395668	8	126.36	95.60	NA	NA	NA	NA	NA	NA	NA
AX-117395705	9	88.16	71.69	NA	NA	NA	NA	NA	NA	NA
AX-117395848	3	132.25	106.81	NA	NA	NA	NA	NA	NA	NA
AX-117395971	8	62.07	45.53	NA	NA	NA	NA	NA	NA	NA
AX-117395974	1	33.62	38.14	NA	NA	NA	NA	NA	NA	NA
AX-117396039	11	18.74	16.43	NA	NA	NA	NA	NA	NA	NA
AX-117396182	10	134.63	103.92	NA	NA	NA	NA	NA	NA	NA

We review the number of SNPs genotyped in Scots pine and selected for further analyses in the previous sections, which can be found in these two files:

35949 SNPs from the dataset with MAF were found in ‘snpInfo420k_PR_010623.txt’.
32107 SNPs from the dataset without MAF were found in ‘snpInfo420k_PR_010623.txt’.
35794 SNPs from the dataset with MAF were found in ‘snpInfo50k_PR_010623.txt’.
32006 SNPs from the dataset without MAF were found in ‘snpInfo50k_PR_010623.txt’.

288 SNPs (from the dataset with MAF) are not in the file ‘snpInfo420k_PR_010623.txt’, and 443 are not in the file ‘snpInfo50k_PR_010623.txt’.

Below we use the ‘snpInfo420k_PR_010623.txt’ file.

We look at the number of SNPs in each linkage group (LG column) among the SNPs genotyped in Scots pine (the SNPs selected in the previous sections).

Code

lapply(list_geno,function(x){
  
  map_snp_420k %>% 
    filter(SNP %in% colnames(x[,-c(1:3)])) %>% 
    count(LG) 
  
}) %>% 
  setNames(c("Dataset with MAF","Dataset without MAF")) %>% 
  bind_rows(.id="Dataset") %>% 
  pivot_wider(names_from = "Dataset", values_from = "n") %>% 
  kable_mydf()

LG	Dataset with MAF	Dataset without MAF
1	2947	2704
2	2914	2704
3	2649	2488
4	2568	2407
5	2527	2366
6	2479	2313
7	2433	2277
8	2424	2249
9	2410	2268
10	2374	2197
11	2337	2157
12	2182	1994
NA	5705	3983

For each linkage group, we look at the number of SNPs assigned to each chromosome (TAB_CONTIG column). For most linkage groups, SNPs are mostly assigned to one chromosome, except for the linkage groups 2 and 3 for which the SNPs are almost equally well assigned to chromosome 1 and 3.

Code

lapply(list_geno,function(x){
  
 map_snp_420k %>% 
    filter(SNP %in% colnames(x[,-c(1:3)])) %>% 
    count(LG, TAB_CONTIG)
  
}) %>% 
  setNames(c("Dataset with MAF","Dataset without MAF")) %>% 
  bind_rows(.id="Dataset") %>% 
  pivot_wider(names_from = "Dataset", values_from = "n") %>% 
  kable_mydf()

LG	TAB_CONTIG	Dataset with MAF	Dataset without MAF
1	chr1	2	2
1	chr10	1	1
1	chr11	1	1
1	chr12	3	2
1	chr2	8	6
1	chr3	13	13
1	chr4	2	1
1	chr5	6	4
1	chr6	2746	2523
1	chr7	2	2
1	chr8	3	3
1	tig00047773	1	1
1	tig00048640	1	1
1	tig00266601_1_2	7	5
1	tig00266603	4	4
1	tig00267508	1	1
1	tig00269545	4	3
1	tig00271547	1	1
1	tig00274049	2	1
1	tig00277291	2	1
1	tig00277298_1_2	9	9
1	tig00278606	3	1
1	NA	125	118
2	chr1	1115	1035
2	chr11	2	2
2	chr12	3	2
2	chr3	1612	1503
2	chr4	2	2
2	chr5	1	1
2	chr6	1	1
2	chr7	2	2
2	chr9	3	2
2	tig00014300	5	5
2	tig00039012	1	1
2	tig00047720	1	1
2	tig00082982	7	7
2	tig00266676	1	1
2	tig00267042_1_2	2	2
2	tig00268437	1	1
2	tig00269165_1_2	5	4
2	tig00269400	5	4
2	tig00269939	4	3
2	tig00271673	1	1
2	tig00272993	1	1
2	tig00273573	1	1
2	tig00273990_1_2	2	2
2	tig00274465	3	3
2	tig00277880_1_2	4	4
2	tig00288264	1	NA
2	tig00290873	1	1
2	tig00295179	1	1
2	NA	126	111
3	chr1	1274	1202
3	chr10	2	NA
3	chr11	1	1
3	chr12	1	1
3	chr2	1	1
3	chr3	1192	1119
3	chr4	4	4
3	chr5	4	4
3	chr7	5	5
3	chr8	3	3
3	chr9	3	3
3	tig00007272	1	1
3	tig00021188	1	1
3	tig00042787	3	3
3	tig00058951	1	1
3	tig00266635_1_2	2	2
3	tig00266652	7	7
3	tig00268140	3	3
3	tig00270633	3	3
3	tig00271447_1_2	2	2
3	tig00271564	1	1
3	tig00272805_1_2	6	5
3	tig00274921_1_2	4	4
3	tig00278952	1	1
3	tig00283502	2	2
3	tig00284211	2	2
3	tig00288197	1	1
3	tig00289947	1	1
3	tig00294850	1	1
3	NA	117	104
4	chr1	1	1
4	chr10	5	5
4	chr11	1	1
4	chr12	5	5
4	chr2	2408	2255
4	chr3	4	4
4	chr4	4	4
4	chr5	2	2
4	chr6	1	1
4	chr7	4	4
4	chr8	4	4
4	chr9	3	2
4	tig00006684	2	2
4	tig00039217	3	2
4	tig00042401	2	2
4	tig00044161	2	2
4	tig00077218	1	1
4	tig00077256	1	1
4	tig00266173_1_2	4	4
4	tig00267336	1	1
4	tig00267344	1	1
4	tig00267796	1	1
4	tig00268462	2	2
4	tig00269123	1	1
4	tig00275397	1	1
4	tig00275855	1	1
4	tig00278336	1	1
4	tig00278825	1	1
4	tig00279441	1	1
4	tig00283632	1	1
4	tig00283634	1	1
4	tig00287377	2	2
4	tig00288284	3	3
4	NA	93	87
5	chr11	2	2
5	chr12	2	2
5	chr2	6	5
5	chr3	5	5
5	chr4	5	5
5	chr5	6	4
5	chr6	2	2
5	chr7	2351	2203
5	chr8	3	3
5	chr9	2	2
5	tig00006324	1	1
5	tig00018076	3	3
5	tig00039301	3	3
5	tig00077664	2	2
5	tig00266333	2	2
5	tig00271531_1_2	4	4
5	tig00272769	1	1
5	tig00274292	2	2
5	tig00275497	1	1
5	tig00277599	1	1
5	tig00279854	2	2
5	tig00281158	3	3
5	tig00281464	1	NA
5	tig00281531	1	1
5	tig00285132	2	2
5	tig00285402	2	2
5	NA	112	103
6	chr1	7	6
6	chr10	3	3
6	chr11	3	3
6	chr2	1	1
6	chr3	4	4
6	chr4	3	2
6	chr5	2318	2167
6	chr7	2	2
6	chr8	5	4
6	chr9	1	1
6	tig00030197	2	2
6	tig00034687	3	3
6	tig00035960	3	3
6	tig00038225	1	1
6	tig00267128	1	1
6	tig00271084	3	3
6	tig00271785_1_2	6	5
6	tig00273008	1	1
6	tig00274021	1	NA
6	tig00274023	1	NA
6	tig00274324	1	1
6	tig00278253	1	1
6	tig00281824	1	1
6	tig00284930	1	1
6	tig00290150	1	1
6	tig10501301_1_2	1	NA
6	NA	104	96
7	chr1	1	1
7	chr10	2270	2126
7	chr12	4	4
7	chr2	6	5
7	chr3	4	3
7	chr4	1	1
7	chr5	2	2
7	chr6	1	1
7	chr7	1	1
7	chr8	1	1
7	chr9	4	3
7	tig00090424	4	4
7	tig00270387	1	1
7	tig00270817	11	10
7	tig00272175	1	1
7	tig00273155	3	3
7	tig00273283	5	5
7	tig00273284_1_2	1	1
7	tig00274157_1_2	3	3
7	tig00276392	2	2
7	tig00276869	1	1
7	tig00277386	1	1
7	tig00277752	1	1
7	tig00279281	1	1
7	tig00284464	1	1
7	tig00284732	2	2
7	NA	100	92
8	chr1	8	7
8	chr10	2	2
8	chr11	2	2
8	chr2	3	3
8	chr3	5	5
8	chr4	23	21
8	chr5	3	3
8	chr7	3	2
8	chr8	2	1
8	chr9	2256	2100
8	tig00001967	1	1
8	tig00008465	1	1
8	tig00027363	1	1
8	tig00042343	1	1
8	tig00059690	1	1
8	tig00269493_1_2	1	1
8	tig00270747_1_2	1	NA
8	tig00273090	1	1
8	tig00279148	1	1
8	tig00283104	1	1
8	tig00285956	1	1
8	NA	106	93
9	chr1	2	2
9	chr10	10	10
9	chr12	2	2
9	chr2	11	11
9	chr3	1	1
9	chr4	2	2
9	chr5	4	3
9	chr6	1	1
9	chr7	1	1
9	chr8	2229	2101
9	chr9	2	2
9	tig00042350	1	NA
9	tig00102113	1	1
9	tig00266967_1_2	4	2
9	tig00267961	1	1
9	tig00268080	1	1
9	tig00268161_1_2	1	1
9	tig00268361_1_2	4	4
9	tig00271053_1_2	1	1
9	tig00271183	5	5
9	tig00271604	1	1
9	tig00272358	1	1
9	tig00272805_1_2	5	4
9	tig00272806	3	3
9	tig00272923	3	3
9	tig00273233	1	1
9	tig00274530	1	1
9	tig00275016	3	3
9	tig00276193_1_2	2	2
9	tig00276303	1	1
9	tig00278099	1	1
9	tig00281147_1_2	7	7
9	tig00281636_1_2	5	5
9	tig00284223	2	2
9	tig00294966	1	NA
9	NA	89	81
10	chr1	1	1
10	chr10	2	2
10	chr11	2	2
10	chr12	2	2
10	chr2	2	1
10	chr3	2	2
10	chr4	2210	2046
10	chr5	2	2
10	chr7	1	1
10	chr8	1	1
10	chr9	1	1
10	tig00023909	1	1
10	tig00041218	10	10
10	tig00044581	1	1
10	tig00095463	1	1
10	tig00268633	2	1
10	tig00272284	2	2
10	tig00272524	2	2
10	tig00274145_1_2	2	2
10	tig00275612	1	1
10	tig00278274	2	2
10	tig00280653	1	1
10	tig00281101	1	1
10	tig00287465	2	2
10	NA	120	109
11	chr1	2	1
11	chr10	1	1
11	chr11	2141	1975
11	chr12	1	1
11	chr2	1	1
11	chr3	5	4
11	chr4	12	10
11	chr5	3	3
11	chr6	1	1
11	chr7	27	27
11	chr8	3	3
11	chr9	4	3
11	tig00038982	1	1
11	tig00072456	2	2
11	tig00267405	3	3
11	tig00268171_1_2	2	2
11	tig00269018_1_2	3	3
11	tig00270240	1	1
11	tig00270987	2	2
11	tig00272133_1_2	1	1
11	tig00274581	3	3
11	tig00278385	5	4
11	tig00284127	3	3
11	tig00284295	2	2
11	tig00285408_1_2	10	10
11	tig00286063	1	1
11	tig00287017	1	1
11	NA	96	88
12	chr1	4	4
12	chr10	5	5
12	chr11	1	1
12	chr12	2030	1854
12	chr2	3	3
12	chr3	16	15
12	chr4	1	1
12	chr5	1	1
12	chr7	3	3
12	chr9	1	1
12	tig00267749	2	2
12	tig00267970	2	2
12	tig00269650	1	NA
12	tig00273052	3	3
12	tig00274946	2	2
12	tig00277262	12	8
12	tig00278825	1	1
12	tig00280814	1	1
12	tig00285603	1	1
12	tig00291047	1	1
12	tig00291717	1	1
12	NA	90	84
NA	chr1	436	301
NA	chr10	427	310
NA	chr11	440	334
NA	chr12	407	270
NA	chr2	401	279
NA	chr3	532	361
NA	chr4	471	317
NA	chr5	454	301
NA	chr6	611	434
NA	chr7	479	340
NA	chr8	487	342
NA	chr9	470	320
NA	tig00005950	1	1
NA	tig00006979	1	1
NA	tig00010682	1	1
NA	tig00018076	1	NA
NA	tig00023909	1	NA
NA	tig00034687	1	1
NA	tig00037617	1	1
NA	tig00038225	2	1
NA	tig00038982	1	1
NA	tig00041218	1	1
NA	tig00042401	1	NA
NA	tig00042787	1	1
NA	tig00058951	1	1
NA	tig00060300	1	NA
NA	tig00072456	2	2
NA	tig00082982	1	1
NA	tig00089845	1	1
NA	tig00090424	1	NA
NA	tig00095463	1	1
NA	tig00266173_1_2	1	1
NA	tig00266435	1	1
NA	tig00266601_1_2	2	1
NA	tig00266757_1_2	1	1
NA	tig00267042_1_2	1	1
NA	tig00267302	1	1
NA	tig00267405	4	3
NA	tig00268080	1	1
NA	tig00268161_1_2	2	1
NA	tig00268361_1_2	2	2
NA	tig00269450	1	1
NA	tig00269493_1_2	1	1
NA	tig00269545	1	1
NA	tig00269568_1_2	1	1
NA	tig00270817	1	NA
NA	tig00271522	1	1
NA	tig00271726	1	1
NA	tig00272133_1_2	1	1
NA	tig00272749	1	1
NA	tig00272805_1_2	2	2
NA	tig00272806	4	NA
NA	tig00272993	1	1
NA	tig00273283	1	1
NA	tig00273589	1	1
NA	tig00274324	1	1
NA	tig00274451	1	1
NA	tig00274465	1	1
NA	tig00274530	1	1
NA	tig00274883	1	1
NA	tig00276303	1	NA
NA	tig00276392	2	2
NA	tig00277262	1	NA
NA	tig00277599	1	1
NA	tig00277624	1	1
NA	tig00277880_1_2	1	1
NA	tig00278274	1	1
NA	tig00278385	1	1
NA	tig00278825	1	1
NA	tig00281147_1_2	5	5
NA	tig00281350	1	1
NA	tig00281464	1	1
NA	tig00281636_1_2	4	4
NA	tig00283418	1	1
NA	tig00284211	1	1
NA	tig00285408_1_2	1	1
NA	tig00285453	1	1
NA	tig00287377	1	1
NA	tig00288197	2	2
NA	tig00288593	1	1
NA	tig00291717	1	1

To identify SNP located nearby on the genome, we first identify SNPs on the same chromosome (based on the TAB_CONTIG column) and then determine their proximity using the TAB_POS column.

Below we plot the genome positions of the SNPs without MAF.

Code

# Formatting gen_coord for the figure
gen_coord <- map_snp_420k %>%
  filter(SNP %in% colnames(list_geno$list_withmaf[,-c(1:3)])) %>% 
  filter(TAB_CONTIG %in% paste0("chr", 1:12))
# sapply(gen_coord, function(x) sum(is.na(x))) # no NAs in the TAB_CONTIG and TAB_POS columns


gen_coord %>%
  ggplot() +
  # All SNPs of the reference genome 
  geom_segment(aes(x = TAB_POS, xend = TAB_POS, y = 0, yend = 0.2), col = "blue", size = 0.06, alpha=0.3) + 
  ylim(0,0.5) +
  facet_wrap(~TAB_CONTIG, scales = "free_x", ncol=3) +
  xlab("SNP positions on the reference genome") +
  ylab("") +
  theme_bw(base_size = 11) +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "right",
    legend.background = element_blank(),
    panel.grid = element_blank(),
    legend.box.background = element_blank(),
    plot.background = element_blank(),
    panel.background = element_blank(),
    legend.text = element_text(size = rel(0.8)),
    strip.text = element_text(size = 11)
  )