Formatting, filtering, imputation of missing data, genetic relationships
Author
Juliette Archambeau
Published
Invalid Date
1 Raw genomic data
1.1 Loading
Here are the 10 first rows and columns of the original genomic dataset.
Code
# Load the raw genomic data:raw_data <-read_table(here("data/ScotsPine/GenomicData/data_update_Fev2025/raw_data/PineTrial_genotypes.txt"),show_col_types=F) %>% dplyr::select(probeset_id,sort(colnames(.)))# vizraw_data[1:10,1:10] %>%kable_mydf(boldfirstcolumn = T, font_size =11)
probeset_id
5001
5002
5004
5005
5006
5007
5008
5009
5010
AX-117395395
AG
GG
AA
GG
AG
AG
AG
GG
GG
AX-117395404
GG
AG
AG
AG
AG
AG
AA
AA
AA
AX-117395409
GG
GG
GG
GG
TG
TG
GG
GG
GG
AX-117395416
AG
GG
GG
GG
GG
GG
GG
GG
GG
AX-117395425
TT
TG
TG
TG
GG
GG
TG
GG
GG
AX-117395432
AA
AA
NN
AA
AA
AA
AA
AA
AA
AX-117395439
CC
AC
CC
AC
AC
AC
AC
AC
CC
AX-117395441
CG
CG
CG
GG
GG
CC
GG
GG
CG
AX-117395444
GG
AG
GG
GG
GG
AG
GG
GG
AG
AX-117395445
GG
GG
CG
CG
CG
CG
CG
CC
GG
The original genomic dataset includes 1761 individuals (columns) and 36433 SNPs (rows).
Code
# We apply a function to each column (i.e SNP) of the raw genomic data, # after removing the first two columns, i.e. the clone ID and the assay in which the clone was genotypeddata <- raw_data %>%column_to_rownames(var="probeset_id") %>%t() %>%as.data.frame()list_alleles <-lapply(colnames(data), function(x){ vec <- data[,x] tab <-table(vec) %>%as.matrix() %>%t() %>%as.data.frame() %>% dplyr::select(-contains("NN"))if(length(colnames(tab))==1){ # monomorphic case, e.g. only A/A major <-names(which.max(tab))case_when(vec == major ~0) } elseif(length(colnames(tab))==2){ # case where there is no homozygote with the rare allele, e.g. A/A and A/G major <-names(which.max(tab)) mid <-names(which.min(tab))case_when(vec == major ~0, vec == mid ~1) } elseif (length(colnames(tab))==3){ # common case, e.g. A/A, A/G and G/Gif(tab[,1]==tab[,3]){ # to account for cases where there is the same number of A/A and G/G major <-colnames(tab)[1] minor <-colnames(tab)[3] } else { major <-names(which.max(tab[,c(1,3)])) minor <-names(which.min(tab[,c(1,3)])) } mid <-setdiff(names(tab),c(major,minor)) case_when(vec == major ~0, vec == minor ~2, vec == mid ~1) }}) %>%setNames(colnames(data)) # re-attribute the SNP ID to list namesdf <- list_alleles %>%as_tibble() %>%t() %>%set_colnames(rownames(data)) %>%# attribute individual IDas.data.frame()# save the datasetdf %>%saveRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/unfiltered_allele_counts.rds"))
Here are the 10 first rows and columns of the formatted dataset.
# function to calculate the minor allele countssum_minor_allele <-function(x){df %>%t() %>%as_tibble() %>%sapply(function(x) sum(x,na.rm=T)) %>%as.data.frame() %>%set_colnames(c("sum")) %>%rownames_to_column(var ="snp") %>%as_tibble() }
After filtering monomorphic SNPs and MAC, 36237 SNPs left.
2.2 NAs and MAF
2.2.1 Missing data per clone
Code
tab <- df %>%as_tibble() %>%sapply(function(x) sum(is.na(x))*100/nrow(df)) %>%as.data.frame() %>%set_colnames(c("na_freq")) %>%rownames_to_column(var ="clone") %>%as_tibble() %>%arrange(na_freq) %>%mutate(id=1:nrow(.))plot(tab$id,tab$na_freq,pch=20,xlab="Individual number",ylab="Percentage of missing data per individual")
All individuals have less than 4.33% missing data. Therefore, we do not apply a filter of missing data per individual.
2.2.2 Minor allele frequencies
We want to remove minor allele frequencies (MAF) for gene-environment association analyses, but not for population genetic structure estimation. Therefore, we produce two datasets: one with MAF and the other without.
# We export the two datasets in a list.maf_threshold <-5list_geno <-list(list_withmaf=list(df = df),list_withoutmaf=list(df = df[!(row.names(df) %in% tab[tab$MAF<maf_threshold,]$snp),]))
In the dataset without MAF, we remove the SNPs with MAF < 5%: there are 32302 SNPs left.
In the dataset without MAF, the maximum percentage of missing data per SNP is 3.86%.
In the dataset with MAF, the maximum percentage of missing data per SNP is 3.86%.
Therefore, we do not apply a filter for the percentage of missing data per SNP and we keep all the SNPs.
Code
plot(NAfreq_per_SNP$list_withoutmaf$id,NAfreq_per_SNP$list_withoutmaf$na_freq,pch=20,xlab="SNP number",ylab="Percentage of missing data per SNP",main="Genomic dataset without MAF")
Code
plot(NAfreq_per_SNP$list_withmaf$id,NAfreq_per_SNP$list_withmaf$na_freq,pch=20,xlab="SNP number",ylab="Percentage of missing data per SNP",main="Genomic dataset with MAF")
After the filtering steps:
36237 SNPs and 1761 individuals in the dataset not filtered for MAF.
32302 SNPs and 1761 individuals in the dataset filtered for MAF.
2.3 Exporting the filtered datasets
When exporting the datasets, we add the population and family information.
In February 2025, the family assignment of some individuals were modified. We load the original family and populations assignments and update it with the new assignments.
Code
# Original family and population assignementfield_data <-read_excel(here("data/ScotsPine/Field.xlsx"), na ="NA") %>% dplyr::select(PopulationCode,Family, FieldCode) # Updated family assignementfam_rea <-read_table(here("data/ScotsPine/GenomicData/data_update_Fev2025/raw_data/FamilyReassignment.txt"),show_col_types =FALSE)# With the code below, we see that the "family" column of "Field.xlsx" and the "OriginalFamily" column of "FamilyReassignment.txt" are the same, which is necessary.# field_data %>% # left_join(fam_rea, by="FieldCode") %>% # filter(OriginalFamily != AssignedFamily | OriginalFamily != Family | AssignedFamily != Family) # We are going to use the "AssignedFamily" column (as advised by Annika Perry)field_data <- field_data %>%left_join(fam_rea, by="FieldCode") %>% dplyr::select(-Family, -OriginalFamily) %>% dplyr::rename(Family = AssignedFamily) %>%mutate(FieldCode =as.character(FieldCode))
We format the two datasets with SNPs in columns and individuals in rows. And we add two columns with information about the family and the population of each individual.
We show below the first 10 rows and columns of the dataset not filtered for MAF.
Code
list_geno <-lapply(list_geno, function(x){# we invert the dataset with tidyverse (quite long) df <- x$df %>%rownames_to_column("SNP") %>%pivot_longer(-SNP, names_to ="FieldCode", values_to ="Value") %>%pivot_wider(names_from = SNP, values_from = Value)# we merge the two datasets field_data %>%right_join(df, by="FieldCode")})list_geno$list_withmaf[1:10,1:10] %>%kable_mydf()
PopulationCode
FieldCode
Family
AX-117395395
AX-117395404
AX-117395409
AX-117395416
AX-117395425
AX-117395432
AX-117395439
GC
5001
GC5
1
2
0
1
2
0
0
GE
5002
GE9
0
1
0
0
1
0
1
RM
5004
RM7
2
1
0
0
1
NA
0
SO
5005
SO2
0
1
0
0
1
0
1
MG
5006
MG10
1
1
1
0
0
0
1
BB
5007
BB8
1
1
1
0
0
0
1
CR
5008
CR5
1
0
0
0
1
0
1
GT
5009
GT8
0
0
0
0
0
0
1
CC
5010
CC7
0
0
0
0
0
0
0
SD
5011
SD7
1
1
0
0
1
0
2
2.3.1 Allele counts
We export allele counts.
Code
# we save the dataset not filtered for MAFsaveRDS(list_geno$list_withmaf, here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_counts_withMAF.rds"))# we save the dataset without MAFsaveRDS(list_geno$list_withoutmaf, here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_counts_withoutMAF.rds"))
2.3.2 Allele frequencies
We calculate the population allele frequencies.
Code
# Function to calculate the population allele frequenciescompute_allele_freq <-function(x){ x %>% dplyr::select(PopulationCode, contains("AX")) %>%group_by(PopulationCode) %>%summarise_all(~sum(., na.rm =TRUE)/((n()-sum(is.na(.)))*2))}
Code
# For the dataset with MAFlist_geno$list_withmaf %>% compute_allele_freq %>%saveRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_freq_withMAF.rds"))# For the dataset without MAFlist_geno$list_withoutmaf %>% compute_allele_freq %>%saveRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_freq_withoutMAF.rds"))
3 Imputation of missing data
We impute missing data based on the most common genotype within the family.
We first impute missing data in the dataset not filtered for MAF.
Code
# we load filtered genomic data not filtered for MAFdf <-readRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_counts_withMAF.rds"))# Function to calculate mode (i.e., most common value)Mode <-function(x) { ux <-unique(x) ux[which.max(tabulate(match(x, ux)))]}# Replace NA values with the mode within each family for each SNPimpdf <- df %>%group_by(Family) %>%mutate(across(starts_with("AX"), ~replace(., is.na(.), Mode(.[!is.na(.)])))) %>%ungroup()
But, there are 8 NAs left. Here are the two families and populations they belong to.
Code
impdf %>% dplyr::select(PopulationCode, FieldCode, Family, where(~any(is.na(.)))) %>%# Select columns with NA valuesfilter(if_any(everything(), is.na)) %>%# selects rows with NA values kable_mydf
PopulationCode
FieldCode
Family
AX-388239058
AX-388336314
CR
7008
CR8
0
NA
CR
7047
CR8
0
NA
GT
7129
GT4
NA
0
GT
7182
GT4
NA
0
CR
7441
CR8
0
NA
GT
7442
GT4
NA
0
CR
7472
CR8
0
NA
GT
7496
GT4
NA
0
We look at the counts of each allele in the two families with NAs remaining after imputation.
Code
impdf %>% dplyr::select(PopulationCode, FieldCode, Family, where(~any(is.na(.)))) %>%# Selects columns where there are NA valuesfilter(PopulationCode %in%c("CR", "GT")) %>%pivot_longer(cols =starts_with("AX"), names_to ="SNP", values_to ="allele_value") %>%group_by(PopulationCode, allele_value) %>%summarize(allele_count =n(), .groups ="drop") %>%filter(allele_value %in%c(0, 1, 2)) %>%# Include only counts for 0, 1, and 2kable_mydf()
PopulationCode
allele_value
allele_count
CR
0
132
CR
1
30
CR
2
6
GT
0
129
GT
1
27
GT
2
4
Based on this, we attribute the value of 0 to the allele AX-388239058 in family GT4 and to the allele AX-388336314 in family CR8.
Code
impdf <- impdf %>%mutate(`AX-388239058`=case_when( Family =="GT4"~0, # Assign 0 when Family is GT4TRUE~`AX-388239058`# Retain original value otherwise ),`AX-388336314`=case_when( Family =="CR8"~0, # Assign 0 when Family is CR8TRUE~`AX-388336314`# Retain original value otherwise ) )# We save the datasetsaveRDS(impdf, here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/imputed_allele_counts_withMAF.rds"))
We then subset the dataset by keeping only SNPs without MAF (to get an imputed dataset without MAF).
Code
impdf <-readRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/imputed_allele_counts_withMAF.rds"))# Extract SNPs filtered for MAFsnps_dfnomaf <-readRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_counts_withoutMAF.rds")) %>% dplyr::select(-PopulationCode,-Family, -FieldCode) %>%colnames()# Save dataset filtered for MAF and with imputed missing dataimpdf %>% dplyr::select(PopulationCode, Family, FieldCode, any_of(snps_dfnomaf)) %>%saveRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/imputed_allele_counts_withoutMAF.rds"))
4 Genetic relatedness coefficients
We use the genomic data not filtered for MAF and with no imputation of missing data to estimate the genetic relatedness coefficients \(r_{ij}\). Assuming no inbreeding, \(r_{ij}\) can be interpreted as the proportion of the genome that is shared between individuals \(i\) and \(j\).
Code
# we load genomic data not filtered for MAF and without imputation for missing datadf <-readRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/filtered_allele_counts_withMAF.rds"))# change the dataset to a matrixmat <- df %>% dplyr::select(-Family, -PopulationCode) %>%column_to_rownames("FieldCode") %>%as.matrix()# We estimate the GRM with the VanRaden methodGmat_VanRaden <-Gmatrix(SNPmatrix=mat, missingValue=NA, method="VanRaden")# We save the G matrixsaveRDS(Gmat_VanRaden, here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/Gmat_VanRaden.rds"))
We extract the genetic relatedness coefficients for each pair of individuals.
In the tutorial GWAS 5: Relatedness and population structure by Matti Pirine (University of Helsinki), the authors refer to cutting points to assign pairs of individuals to a certain degree of relatedness based on their \(r_{ij}\). These thresholds are “a commonly-used inference procedure of the KING software” and are based on the powers of \(1/2\). We will use these thresholds to assign the pairs of individuals to a given degree of relatedness (i.e., identical, 1st degree, 2nd degree, 3rd degree or unrelated).
Code
# we load the GRMGmat_VanRaden <-readRDS(here("data/ScotsPine/GenomicData/data_update_Fev2025/formatted_data/Gmat_VanRaden.rds"))# viz the matrixGmat_VanRaden[1:10,1:10] %>%as.data.frame() %>%kable_mydf(boldfirstcolumn =TRUE, round_number =4)
5001
5002
5004
5005
5006
5007
5008
5009
5010
5011
5001
1.0018
0.0065
0.0060
-0.0033
-0.0130
-0.0103
-0.0055
-0.0099
0.0148
0.0064
5002
0.0065
1.0056
0.0299
0.0097
-0.0249
-0.0018
-0.0072
-0.0092
-0.0014
0.0037
5004
0.0060
0.0299
1.0142
0.0001
-0.0099
0.0018
-0.0013
-0.0058
-0.0153
-0.0124
5005
-0.0033
0.0097
0.0001
1.0031
-0.0132
0.0050
0.0085
-0.0074
-0.0224
0.0124
5006
-0.0130
-0.0249
-0.0099
-0.0132
1.0103
0.0020
-0.0168
-0.0029
-0.0055
0.0018
5007
-0.0103
-0.0018
0.0018
0.0050
0.0020
1.0094
-0.0132
0.0004
-0.0045
0.0229
5008
-0.0055
-0.0072
-0.0013
0.0085
-0.0168
-0.0132
0.9954
0.0027
0.0026
-0.0086
5009
-0.0099
-0.0092
-0.0058
-0.0074
-0.0029
0.0004
0.0027
1.0081
-0.0155
-0.0230
5010
0.0148
-0.0014
-0.0153
-0.0224
-0.0055
-0.0045
0.0026
-0.0155
1.0593
-0.0063
5011
0.0064
0.0037
-0.0124
0.0124
0.0018
0.0229
-0.0086
-0.0230
-0.0063
1.0340
Code
# Function to extract the genetic relatedness coefficients in a vectorextract_relatedness_coeffs_from_GRM <-function(GRM){# Extract upper triangle of the matrixupper_triangle <-upper.tri(GRM, diag=TRUE)# Set lower triangle elements to NAGRM[!upper_triangle] <-NA# Convert the resulting matrix to a dataframedf <-as.data.frame(as.table(GRM))# Remove rows with NA values (optional if you want to remove NA values)df <-na.omit(df)# Rename the columnsnames(df) <-c("Ind1", "Ind2", "r")df}# Run the function on our GRMdf_r <-extract_relatedness_coeffs_from_GRM(GRM=Gmat_VanRaden) %>%arrange(r)# Define the thresholds to separate the degrees of relatednessKING_thresholds <-c(0.088, 0.177, 0.354, 0.707)# We count the number of individual pairs within each degree of relatedness r_counts <- df_r %>%filter(!Ind1==Ind2) %>%# we remove the genetic relatedness coeffs calculated by comparing the same individualssummarize(identical =sum(r > KING_thresholds[[4]]),first_degree =sum(r <= KING_thresholds[[4]] & r > KING_thresholds[[3]]),second_degree =sum(r <= KING_thresholds[[3]] & r > KING_thresholds[[2]]),third_degree =sum(r <= KING_thresholds[[2]] & r > KING_thresholds[[1]]),unrelated =sum(r <= KING_thresholds[[1]]) )# we prepare the dataset for the plot dfplot <- df_r %>%filter(!Ind1==Ind2) %>%# filter(r>0.05) %>%# to aid visualization and speed up plotting (it was long if I keep all the pairs)mutate(Index=1:nrow(.))# Define the breaks and labels for the y-scaley_breaks <-c(0, 0.25, 0.5, 0.75, 1, KING_thresholds)y_labels_colors <-ifelse(y_breaks %in% KING_thresholds, "orange", "black")# Plotggplot(dfplot, aes(x = Index, y = r, color =cut(r, breaks =c(-Inf, KING_thresholds, Inf),labels =c(paste0("Unrelated (n = ",r_counts$unrelated,")"), paste0("3rd degree (n = ",r_counts$third_degree,")"), paste0("2nd degree (n = ",r_counts$second_degree,")"), paste0("1st degree (n = ",r_counts$first_degree,")"), paste0("Identical (n = ",r_counts$identical,")"))))) +geom_point() +geom_hline(yintercept = KING_thresholds, linetype ="dashed", color ="orange") +labs(x ="Pairs of individuals", y ="Genetic relatedness coefficient") +theme_bw() +scale_color_manual(values =c("blue", "green", "deeppink", "purple", "yellow")) +scale_y_continuous(breaks = y_breaks) +theme(axis.text.y =element_text(angle =0, hjust =1, colour = y_labels_colors),legend.position =c(0.2,0.85),legend.title =element_blank())
In our dataset, we expect trees to be unrelated and that trees from the same family are half-siblings (with \(0.177 > r_{ij} > 0.354\)).
We look at the field data of the trees with higher \(r_{ij}\) than expected.
Code
# We load data with field and nursery information (that we merge with the updated family assignement)field_data <-readRDS(file=here("data/ScotsPine/formatted_CG_data.rds")) %>% dplyr::select(PopulationCode, FieldCode, FieldSite, Block, Nursery) %>%mutate(FieldCode =as.character(FieldCode)) %>%right_join(field_data, by=c("FieldCode", "PopulationCode"))
Code
# function to merge field data of the two individuals of each pairshow_fielddata <-function(r_threshold_min,r_threshold_max){ subdf <- dfplot %>%filter(r>r_threshold_min) %>%filter(r<r_threshold_max)lapply(c("Ind1","Ind2"), function(x) { subdf %>% dplyr::select(any_of(x)) %>% dplyr::rename(FieldCode=any_of(x)) %>%left_join(field_data, by ="FieldCode") %>%set_colnames(paste0(colnames(.),"_",x))}) %>%bind_cols() %>%mutate(RelatednessCoeff = subdf$r)}
We first look at pairs of individuals considered to be identical (=clones), i.e. with \(r_{ij}>0.707\).
For the eight pairs of individuals with \(r_{ij}>0.707\) (i.e., individuals considered as identical = clones), individuals come from the same families but not always the same field site and nurseries.
We then look at the 941 pairs of individuals considered to be full-siblings (i.e., with \(0.354>r_{ij}>0.707\)). We do not plot all the pairs but only those for which the two individuals do not belong to the same family.
The files ‘snpInfo420k_PR_010623.txt’ and ‘snpInfo_PR_010623.txt’ contains genetic map locations in the P. tabuliformis genome for the PiSy420k and PiSy50k arrays, respectively. This is from Tanja Pyhajarvi’s group at the University of Helsinki, and mostly the work of Pasi Rastas.
We show the first rows of ‘snpInfo420k_PR_010623.txt’ below.
We review the number of SNPs genotyped in Scots pine and selected for further analyses in the previous sections, which can be found in these two files:
35949 SNPs from the dataset with MAF were found in ‘snpInfo420k_PR_010623.txt’.
32107 SNPs from the dataset without MAF were found in ‘snpInfo420k_PR_010623.txt’.
35794 SNPs from the dataset with MAF were found in ‘snpInfo50k_PR_010623.txt’.
32006 SNPs from the dataset without MAF were found in ‘snpInfo50k_PR_010623.txt’.
288 SNPs (from the dataset with MAF) are not in the file ‘snpInfo420k_PR_010623.txt’, and 443 are not in the file ‘snpInfo50k_PR_010623.txt’.
Below we use the ‘snpInfo420k_PR_010623.txt’ file.
We look at the number of SNPs in each linkage group (LG column) among the SNPs genotyped in Scots pine (the SNPs selected in the previous sections).
Code
lapply(list_geno,function(x){ map_snp_420k %>%filter(SNP %in%colnames(x[,-c(1:3)])) %>%count(LG) }) %>%setNames(c("Dataset with MAF","Dataset without MAF")) %>%bind_rows(.id="Dataset") %>%pivot_wider(names_from ="Dataset", values_from ="n") %>%kable_mydf()
LG
Dataset with MAF
Dataset without MAF
1
2947
2704
2
2914
2704
3
2649
2488
4
2568
2407
5
2527
2366
6
2479
2313
7
2433
2277
8
2424
2249
9
2410
2268
10
2374
2197
11
2337
2157
12
2182
1994
NA
5705
3983
For each linkage group, we look at the number of SNPs assigned to each chromosome (TAB_CONTIG column). For most linkage groups, SNPs are mostly assigned to one chromosome, except for the linkage groups 2 and 3 for which the SNPs are almost equally well assigned to chromosome 1 and 3.
Code
lapply(list_geno,function(x){ map_snp_420k %>%filter(SNP %in%colnames(x[,-c(1:3)])) %>%count(LG, TAB_CONTIG)}) %>%setNames(c("Dataset with MAF","Dataset without MAF")) %>%bind_rows(.id="Dataset") %>%pivot_wider(names_from ="Dataset", values_from ="n") %>%kable_mydf()
LG
TAB_CONTIG
Dataset with MAF
Dataset without MAF
1
chr1
2
2
1
chr10
1
1
1
chr11
1
1
1
chr12
3
2
1
chr2
8
6
1
chr3
13
13
1
chr4
2
1
1
chr5
6
4
1
chr6
2746
2523
1
chr7
2
2
1
chr8
3
3
1
tig00047773
1
1
1
tig00048640
1
1
1
tig00266601_1_2
7
5
1
tig00266603
4
4
1
tig00267508
1
1
1
tig00269545
4
3
1
tig00271547
1
1
1
tig00274049
2
1
1
tig00277291
2
1
1
tig00277298_1_2
9
9
1
tig00278606
3
1
1
NA
125
118
2
chr1
1115
1035
2
chr11
2
2
2
chr12
3
2
2
chr3
1612
1503
2
chr4
2
2
2
chr5
1
1
2
chr6
1
1
2
chr7
2
2
2
chr9
3
2
2
tig00014300
5
5
2
tig00039012
1
1
2
tig00047720
1
1
2
tig00082982
7
7
2
tig00266676
1
1
2
tig00267042_1_2
2
2
2
tig00268437
1
1
2
tig00269165_1_2
5
4
2
tig00269400
5
4
2
tig00269939
4
3
2
tig00271673
1
1
2
tig00272993
1
1
2
tig00273573
1
1
2
tig00273990_1_2
2
2
2
tig00274465
3
3
2
tig00277880_1_2
4
4
2
tig00288264
1
NA
2
tig00290873
1
1
2
tig00295179
1
1
2
NA
126
111
3
chr1
1274
1202
3
chr10
2
NA
3
chr11
1
1
3
chr12
1
1
3
chr2
1
1
3
chr3
1192
1119
3
chr4
4
4
3
chr5
4
4
3
chr7
5
5
3
chr8
3
3
3
chr9
3
3
3
tig00007272
1
1
3
tig00021188
1
1
3
tig00042787
3
3
3
tig00058951
1
1
3
tig00266635_1_2
2
2
3
tig00266652
7
7
3
tig00268140
3
3
3
tig00270633
3
3
3
tig00271447_1_2
2
2
3
tig00271564
1
1
3
tig00272805_1_2
6
5
3
tig00274921_1_2
4
4
3
tig00278952
1
1
3
tig00283502
2
2
3
tig00284211
2
2
3
tig00288197
1
1
3
tig00289947
1
1
3
tig00294850
1
1
3
NA
117
104
4
chr1
1
1
4
chr10
5
5
4
chr11
1
1
4
chr12
5
5
4
chr2
2408
2255
4
chr3
4
4
4
chr4
4
4
4
chr5
2
2
4
chr6
1
1
4
chr7
4
4
4
chr8
4
4
4
chr9
3
2
4
tig00006684
2
2
4
tig00039217
3
2
4
tig00042401
2
2
4
tig00044161
2
2
4
tig00077218
1
1
4
tig00077256
1
1
4
tig00266173_1_2
4
4
4
tig00267336
1
1
4
tig00267344
1
1
4
tig00267796
1
1
4
tig00268462
2
2
4
tig00269123
1
1
4
tig00275397
1
1
4
tig00275855
1
1
4
tig00278336
1
1
4
tig00278825
1
1
4
tig00279441
1
1
4
tig00283632
1
1
4
tig00283634
1
1
4
tig00287377
2
2
4
tig00288284
3
3
4
NA
93
87
5
chr11
2
2
5
chr12
2
2
5
chr2
6
5
5
chr3
5
5
5
chr4
5
5
5
chr5
6
4
5
chr6
2
2
5
chr7
2351
2203
5
chr8
3
3
5
chr9
2
2
5
tig00006324
1
1
5
tig00018076
3
3
5
tig00039301
3
3
5
tig00077664
2
2
5
tig00266333
2
2
5
tig00271531_1_2
4
4
5
tig00272769
1
1
5
tig00274292
2
2
5
tig00275497
1
1
5
tig00277599
1
1
5
tig00279854
2
2
5
tig00281158
3
3
5
tig00281464
1
NA
5
tig00281531
1
1
5
tig00285132
2
2
5
tig00285402
2
2
5
NA
112
103
6
chr1
7
6
6
chr10
3
3
6
chr11
3
3
6
chr2
1
1
6
chr3
4
4
6
chr4
3
2
6
chr5
2318
2167
6
chr7
2
2
6
chr8
5
4
6
chr9
1
1
6
tig00030197
2
2
6
tig00034687
3
3
6
tig00035960
3
3
6
tig00038225
1
1
6
tig00267128
1
1
6
tig00271084
3
3
6
tig00271785_1_2
6
5
6
tig00273008
1
1
6
tig00274021
1
NA
6
tig00274023
1
NA
6
tig00274324
1
1
6
tig00278253
1
1
6
tig00281824
1
1
6
tig00284930
1
1
6
tig00290150
1
1
6
tig10501301_1_2
1
NA
6
NA
104
96
7
chr1
1
1
7
chr10
2270
2126
7
chr12
4
4
7
chr2
6
5
7
chr3
4
3
7
chr4
1
1
7
chr5
2
2
7
chr6
1
1
7
chr7
1
1
7
chr8
1
1
7
chr9
4
3
7
tig00090424
4
4
7
tig00270387
1
1
7
tig00270817
11
10
7
tig00272175
1
1
7
tig00273155
3
3
7
tig00273283
5
5
7
tig00273284_1_2
1
1
7
tig00274157_1_2
3
3
7
tig00276392
2
2
7
tig00276869
1
1
7
tig00277386
1
1
7
tig00277752
1
1
7
tig00279281
1
1
7
tig00284464
1
1
7
tig00284732
2
2
7
NA
100
92
8
chr1
8
7
8
chr10
2
2
8
chr11
2
2
8
chr2
3
3
8
chr3
5
5
8
chr4
23
21
8
chr5
3
3
8
chr7
3
2
8
chr8
2
1
8
chr9
2256
2100
8
tig00001967
1
1
8
tig00008465
1
1
8
tig00027363
1
1
8
tig00042343
1
1
8
tig00059690
1
1
8
tig00269493_1_2
1
1
8
tig00270747_1_2
1
NA
8
tig00273090
1
1
8
tig00279148
1
1
8
tig00283104
1
1
8
tig00285956
1
1
8
NA
106
93
9
chr1
2
2
9
chr10
10
10
9
chr12
2
2
9
chr2
11
11
9
chr3
1
1
9
chr4
2
2
9
chr5
4
3
9
chr6
1
1
9
chr7
1
1
9
chr8
2229
2101
9
chr9
2
2
9
tig00042350
1
NA
9
tig00102113
1
1
9
tig00266967_1_2
4
2
9
tig00267961
1
1
9
tig00268080
1
1
9
tig00268161_1_2
1
1
9
tig00268361_1_2
4
4
9
tig00271053_1_2
1
1
9
tig00271183
5
5
9
tig00271604
1
1
9
tig00272358
1
1
9
tig00272805_1_2
5
4
9
tig00272806
3
3
9
tig00272923
3
3
9
tig00273233
1
1
9
tig00274530
1
1
9
tig00275016
3
3
9
tig00276193_1_2
2
2
9
tig00276303
1
1
9
tig00278099
1
1
9
tig00281147_1_2
7
7
9
tig00281636_1_2
5
5
9
tig00284223
2
2
9
tig00294966
1
NA
9
NA
89
81
10
chr1
1
1
10
chr10
2
2
10
chr11
2
2
10
chr12
2
2
10
chr2
2
1
10
chr3
2
2
10
chr4
2210
2046
10
chr5
2
2
10
chr7
1
1
10
chr8
1
1
10
chr9
1
1
10
tig00023909
1
1
10
tig00041218
10
10
10
tig00044581
1
1
10
tig00095463
1
1
10
tig00268633
2
1
10
tig00272284
2
2
10
tig00272524
2
2
10
tig00274145_1_2
2
2
10
tig00275612
1
1
10
tig00278274
2
2
10
tig00280653
1
1
10
tig00281101
1
1
10
tig00287465
2
2
10
NA
120
109
11
chr1
2
1
11
chr10
1
1
11
chr11
2141
1975
11
chr12
1
1
11
chr2
1
1
11
chr3
5
4
11
chr4
12
10
11
chr5
3
3
11
chr6
1
1
11
chr7
27
27
11
chr8
3
3
11
chr9
4
3
11
tig00038982
1
1
11
tig00072456
2
2
11
tig00267405
3
3
11
tig00268171_1_2
2
2
11
tig00269018_1_2
3
3
11
tig00270240
1
1
11
tig00270987
2
2
11
tig00272133_1_2
1
1
11
tig00274581
3
3
11
tig00278385
5
4
11
tig00284127
3
3
11
tig00284295
2
2
11
tig00285408_1_2
10
10
11
tig00286063
1
1
11
tig00287017
1
1
11
NA
96
88
12
chr1
4
4
12
chr10
5
5
12
chr11
1
1
12
chr12
2030
1854
12
chr2
3
3
12
chr3
16
15
12
chr4
1
1
12
chr5
1
1
12
chr7
3
3
12
chr9
1
1
12
tig00267749
2
2
12
tig00267970
2
2
12
tig00269650
1
NA
12
tig00273052
3
3
12
tig00274946
2
2
12
tig00277262
12
8
12
tig00278825
1
1
12
tig00280814
1
1
12
tig00285603
1
1
12
tig00291047
1
1
12
tig00291717
1
1
12
NA
90
84
NA
chr1
436
301
NA
chr10
427
310
NA
chr11
440
334
NA
chr12
407
270
NA
chr2
401
279
NA
chr3
532
361
NA
chr4
471
317
NA
chr5
454
301
NA
chr6
611
434
NA
chr7
479
340
NA
chr8
487
342
NA
chr9
470
320
NA
tig00005950
1
1
NA
tig00006979
1
1
NA
tig00010682
1
1
NA
tig00018076
1
NA
NA
tig00023909
1
NA
NA
tig00034687
1
1
NA
tig00037617
1
1
NA
tig00038225
2
1
NA
tig00038982
1
1
NA
tig00041218
1
1
NA
tig00042401
1
NA
NA
tig00042787
1
1
NA
tig00058951
1
1
NA
tig00060300
1
NA
NA
tig00072456
2
2
NA
tig00082982
1
1
NA
tig00089845
1
1
NA
tig00090424
1
NA
NA
tig00095463
1
1
NA
tig00266173_1_2
1
1
NA
tig00266435
1
1
NA
tig00266601_1_2
2
1
NA
tig00266757_1_2
1
1
NA
tig00267042_1_2
1
1
NA
tig00267302
1
1
NA
tig00267405
4
3
NA
tig00268080
1
1
NA
tig00268161_1_2
2
1
NA
tig00268361_1_2
2
2
NA
tig00269450
1
1
NA
tig00269493_1_2
1
1
NA
tig00269545
1
1
NA
tig00269568_1_2
1
1
NA
tig00270817
1
NA
NA
tig00271522
1
1
NA
tig00271726
1
1
NA
tig00272133_1_2
1
1
NA
tig00272749
1
1
NA
tig00272805_1_2
2
2
NA
tig00272806
4
NA
NA
tig00272993
1
1
NA
tig00273283
1
1
NA
tig00273589
1
1
NA
tig00274324
1
1
NA
tig00274451
1
1
NA
tig00274465
1
1
NA
tig00274530
1
1
NA
tig00274883
1
1
NA
tig00276303
1
NA
NA
tig00276392
2
2
NA
tig00277262
1
NA
NA
tig00277599
1
1
NA
tig00277624
1
1
NA
tig00277880_1_2
1
1
NA
tig00278274
1
1
NA
tig00278385
1
1
NA
tig00278825
1
1
NA
tig00281147_1_2
5
5
NA
tig00281350
1
1
NA
tig00281464
1
1
NA
tig00281636_1_2
4
4
NA
tig00283418
1
1
NA
tig00284211
1
1
NA
tig00285408_1_2
1
1
NA
tig00285453
1
1
NA
tig00287377
1
1
NA
tig00288197
2
2
NA
tig00288593
1
1
NA
tig00291717
1
1
To identify SNP located nearby on the genome, we first identify SNPs on the same chromosome (based on the TAB_CONTIG column) and then determine their proximity using the TAB_POS column.
Below we plot the genome positions of the SNPs without MAF.
Code
# Formatting gen_coord for the figuregen_coord <- map_snp_420k %>%filter(SNP %in%colnames(list_geno$list_withmaf[,-c(1:3)])) %>%filter(TAB_CONTIG %in%paste0("chr", 1:12))# sapply(gen_coord, function(x) sum(is.na(x))) # no NAs in the TAB_CONTIG and TAB_POS columnsgen_coord %>%ggplot() +# All SNPs of the reference genome geom_segment(aes(x = TAB_POS, xend = TAB_POS, y =0, yend =0.2), col ="blue", size =0.06, alpha=0.3) +ylim(0,0.5) +facet_wrap(~TAB_CONTIG, scales ="free_x", ncol=3) +xlab("SNP positions on the reference genome") +ylab("") +theme_bw(base_size =11) +theme(axis.text.y =element_blank(),axis.ticks.y =element_blank(),legend.position ="right",legend.background =element_blank(),panel.grid =element_blank(),legend.box.background =element_blank(),plot.background =element_blank(),panel.background =element_blank(),legend.text =element_text(size =rel(0.8)),strip.text =element_text(size =11) )