Last updated: 2021-10-01

Checks: 7 0

Knit directory: fitnessGWAS/

# Bioconductor annotation package for D. melanogaster. Install it with
# BiocManager::install("org.Dm.eg.db") (or biocLite("org.Dm.eg.db") on
# older Bioconductor releases).
library(org.Dm.eg.db)

# future.globals.maxSize: allow up to 2000 * 1024^2 bytes of global objects
# to be exported to `future` workers.
# stringsAsFactors: keep strings as character vectors when building data frames.
options(future.globals.maxSize = 2000 * 1024 ^ 2, 
        stringsAsFactors = FALSE)

# Partition a vector into consecutive pieces of at most `max_chunk_size`
# elements each. Returns a list whose names are the chunk numbers
# ("1", "2", ...); the final chunk may be shorter than the others.
chunker <- function(x, max_chunk_size) {
  chunk_id <- ceiling(seq_along(x) / max_chunk_size)
  split(x, chunk_id)
}

# Connection to the SQLite database of D. mel annotations that backs the
# org.Dm.eg.db package from Bioconductor. The get_*() functions below query
# its tables through this connection.
con <- dbconn(org.Dm.eg.db)

Get the annotations for each DGRP variant

The following function temporarily loads the >1GB annotation file provided on the DGRP website. We then extract the following variables for each variant, and save them in a SQLite database for memory-efficient searching inside R:

  • The Flybase ID(s), if the variant is within or close to one or more genes
  • The site class of the variant (e.g. intron, 5’-UTR…)
  • The distance-to-gene in nucleotides (for UPSTREAM and DOWNSTREAM variants only)
# Parse the DGRP annotation file into one row per variant-gene pair.
#
# Reads "data/input/dgrp.fb557.annot.txt" and, for every variant that carries
# a gene-level annotation, extracts the Flybase ID, the site class and the
# distance-to-gene field. Takes no arguments; relies on chunker() and on an
# active `future` plan for the parallel step.
#
# Returns a data frame with the character columns SNP, FBID, site.class,
# distance.to.gene and chr.
get_variant_annotations <- function(){
  # Load up the big annotation file, get pertinent info. It's stored in some sort of text string format
  annot <- read.table("data/input/dgrp.fb557.annot.txt", header = FALSE, stringsAsFactors = FALSE)

  # Parse the annotation strings for the given row indices of `annot` and
  # stack the results into one character matrix (NULL if nothing was parsed).
  parse_rows <- function(rows){
    lapply(rows, function(row){
      # Everything before the first "]" holds the per-gene annotations:
      # genes are separated by ";" and the fields within a gene by "|"
      site.class.field <- strsplit(annot$V3[row], split = "]")[[1]][1]
      num.genes <- str_count(site.class.field, ";") + 1
      per.gene <- lapply(strsplit(site.class.field, split = ";")[[1]],
                         function(x) strsplit(x, split = "[|]")[[1]])
      output <- cbind(rep(annot$V1[row], num.genes), do.call("rbind", per.gene))
      # only return SNPs that have some annotation. Don't get the gene symbol.
      # drop = FALSE keeps a one-gene result as a 1-row matrix.
      if(ncol(output) == 5) return(output[, c(1,2,4,5), drop = FALSE])
      else return(NULL)
    }) %>% do.call("rbind", .)
  }

  # Parse the file in chunks of 10000 rows so the work can be spread over workers
  variant.details <- future_lapply(chunker(seq_len(nrow(annot)), max_chunk_size = 10000), parse_rows) %>%
    do.call("rbind", .)
  if(is.null(variant.details)){
    stop("No annotated variants could be parsed from data/input/dgrp.fb557.annot.txt", call. = FALSE)
  }
  variant.details <- as.data.frame(variant.details, stringsAsFactors = FALSE)

  names(variant.details) <- c("SNP", "FBID", "site.class", "distance.to.gene")
  variant.details$FBID <- unlist(str_extract_all(variant.details$FBID, "FBgn[:digit:]+")) # clean up text strings for Flybase ID
  variant.details %>%
    dplyr::filter(site.class != "FBgn0003638") %>% # NB this is a bug in the DGRP's annotation file
    mutate(chr = str_remove_all(substr(SNP, 1, 2), "_")) # get chromosome now for faster sorting later
}

Get the annotations for each Drosophila gene

The following function gets the annotations for all the genes covered by DGRP variants, from the annotation database object provided by Bioconductor. I don’t like the select interface to those objects (it messes with any R code that uses dplyr), so here I save the info into the SQLite database for later access.

Table to convert among gene IDs and names

# Build a table for converting among gene IDs and names.
#
# Joins the "genes", "flybase", "gene_info" and "chromosomes" tables of the
# annotation database behind the global connection `con` on their shared
# "_id" key, then pulls the full result into memory. Takes no arguments.
#
# Returns a data frame with the columns FBID, gene_name, gene_symbol,
# entrez_id and chromosome.
get_gene_annotations <- function(){
  tbl(con, "genes") %>%
    left_join(tbl(con, "flybase"), by = "_id") %>%
    left_join(tbl(con, "gene_info"), by = "_id") %>% 
    left_join(tbl(con, "chromosomes"), by = "_id") %>%
    dplyr::select(flybase_id, gene_name, symbol, gene_id, chromosome) %>%
    dplyr::rename(FBID = flybase_id, gene_symbol = symbol, entrez_id = gene_id) %>%
    collect(n = Inf) # n = Inf: fetch every row, not just a preview
}

KEGG annotations

# Fetch the KEGG pathway annotations keyed by Flybase ID.
#
# Joins the "kegg" table of the annotation database behind the global
# connection `con` to its "flybase" table on the shared "_id" key, then
# pulls the full result into memory. Takes no arguments.
#
# Returns a data frame with the columns FBID and kegg_id.
get_KEGG <- function(){
  tbl(con, "kegg") %>%
    left_join(tbl(con, "flybase"), by = "_id") %>% 
    dplyr::select(flybase_id, path_id) %>%
    dplyr::rename(FBID = flybase_id, kegg_id = path_id) %>%
    collect(n = Inf) # n = Inf: fetch every row, not just a preview
}

GO annotations

# Fetch the GO annotations keyed by Flybase ID.
#
# Joins the "go_all" table of the annotation database behind the global
# connection `con` to its "flybase" table on the shared "_id" key, then
# pulls the full result into memory. Takes no arguments.
#
# Returns a data frame with the columns FBID, go_id and ontology.
get_GO <- function(){
  tbl(con, "go_all") %>%
    left_join(tbl(con, "flybase"), by = "_id") %>% 
    dplyr::select(flybase_id, go_id, ontology) %>%
    dplyr::rename(FBID = flybase_id) %>%
    collect(n = Inf) # n = Inf: fetch every row, not just a preview
}

# Kept as a global because it is reused below: for the GO term lookup and
# when writing the "GO" table to the SQLite database
GO <- get_GO()

GO term meanings

# Look up the ontology and the term for every GO ID found in `GO`.
# The lookup is wrapped in suppressMessages() to silence the messages that
# select() emits.
go_meanings <- suppressMessages(
  AnnotationDbi::select(GO.db, 
                        GO$go_id, c("GOID", "ONTOLOGY", "TERM")))
names(go_meanings) <- c("GO", "ontology", "term")
# The same GO ID appears many times in `GO`; keep one row per ID
go_meanings <- distinct(go_meanings)

Create the SQLite database and add various tables of annotations

# Remove any database file left over from a previous run
if (file.exists("data/derived/annotations.sqlite3")) {
  unlink("data/derived/annotations.sqlite3")
}

# Open a connection to the database file
db <- DBI::dbConnect(RSQLite::SQLite(), "data/derived/annotations.sqlite3", create = TRUE)

# Variant annotations, with indexes on SNP, FBID, chr and site.class
copy_to(db, get_variant_annotations(),
        name = "variants", temporary = FALSE,
        indexes = list("SNP", "FBID", "chr", "site.class"))

# Gene ID / name conversion table
copy_to(db, get_gene_annotations(), name = "genes", temporary = FALSE)

# GO annotations (already held in memory as `GO`)
copy_to(db, GO, name = "GO", temporary = FALSE)

# KEGG annotations
copy_to(db, get_KEGG(), name = "KEGG", temporary = FALSE)

# Meanings of the GO terms
copy_to(db, go_meanings, name = "go_meanings", temporary = FALSE)

View the table of variant annotations

The variants table is expanded upon in the script perform_gwas.Rmd, which also adds the minor allele frequencies, the alleles that were treated as the reference and alternate, etc.

# Connect to the annotation database and print a preview of the variants table
db <- DBI::dbConnect(RSQLite::SQLite(), "data/derived/annotations.sqlite3")
tbl(db, "variants")
# Source:   table<variants> [?? x 9]
# Database: sqlite 3.30.1
#   [/Users/lholman/Rprojects/fitnessGWAS/data/derived/annotations.sqlite3]
   SNP   FBID  site.class distance.to.gene chr   position   MAF minor_allele
   <chr> <chr> <chr>      <chr>            <chr> <chr>    <dbl> <chr>       
 1 2L_1… FBgn… NON_SYNON… 0                2L    10000016 0.463 C           
 2 2L_1… FBgn… INTRON     0                2L    10000016 0.463 C           
 3 2L_1… FBgn… SYNONYMOU… 0                2L    10000033 0.483 G           
 4 2L_1… FBgn… INTRON     0                2L    10000033 0.483 G           
 5 2L_1… FBgn… INTRON     0                2L    10000089 0.429 C           
 6 2L_1… FBgn… NON_SYNON… 0                2L    10000089 0.429 C           
 7 2L_1… FBgn… INTRON     0                2L    10000135 0.478 A           
 8 2L_1… FBgn… NON_SYNON… 0                2L    10000135 0.478 A           
 9 2L_1… FBgn… NON_SYNON… 0                2L    10000234 0.4   C           
10 2L_1… FBgn… INTRON     0                2L    10000234 0.4   C           
# … with more rows, and 1 more variable: major_allele <chr>

View the table of gene annotations

# Print a preview of the genes table through the open connection `db`
tbl(db, "genes")
# Source:   table<genes> [?? x 5]
# Database: sqlite 3.30.1
#   [/Users/lholman/Rprojects/fitnessGWAS/data/derived/annotations.sqlite3]
   FBID        gene_name               gene_symbol entrez_id chromosome
   <chr>       <chr>                   <chr>       <chr>     <chr>     
 1 FBgn0040373 uncharacterized protein CG3038      30970     X         
 2 FBgn0040372 G9a                     G9a         30971     X         
 3 FBgn0261446 uncharacterized protein CG13377     30972     X         
 4 FBgn0000316 cinnamon                cin         30973     X         
 5 FBgn0005427 erect wing              ewg         30975     X         
 6 FBgn0040370 uncharacterized protein CG13375     30976     X         
 7 FBgn0040371 uncharacterized protein CG12470     30977     X         
 8 FBgn0029521 Odorant receptor 1a     Or1a        30978     X         
 9 FBgn0024989 uncharacterized protein CG3777      30979     X         
10 FBgn0004034 yellow                  y           30980     X         
# … with more rows

R version 4.0.3 (2020-10-10)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Catalina 10.15.7

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib

[1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] GO.db_3.11.4  AnnotationDbi_1.50.0
 [4] IRanges_2.22.2       S4Vectors_0.26.1     Biobase_2.48.0      
 [7] BiocGenerics_0.34.0  future.apply_1.5.0   future_1.17.0       
[10] stringr_1.4.0        dplyr_1.0.0          workflowr_1.6.2     

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.4.6     dbplyr_1.4.4     compiler_4.0.3   pillar_1.4.4    
 [5] later_1.0.0      git2r_0.27.1     tools_4.0.3      bit_1.1-15.2    
 [9] digest_0.6.25    memoise_1.1.0    RSQLite_2.2.0    evaluate_0.14   
[13] lifecycle_0.2.0  tibble_3.0.1     pkgconfig_2.0.3  rlang_0.4.6     
[17] cli_2.0.2        DBI_1.1.0        yaml_2.2.1       xfun_0.22       
[21] knitr_1.32       generics_0.0.2   fs_1.4.1         vctrs_0.3.0     
[25] globals_0.12.5   bit64_0.9-7      rprojroot_1.3-2  tidyselect_1.1.0
[29] glue_1.4.2       listenv_0.8.0    R6_2.4.1         fansi_0.4.1     
[33] rmarkdown_2.5    blob_1.2.1       purrr_0.3.4      magrittr_2.0.1  
[37] whisker_0.4      backports_1.1.7  promises_1.1.0   codetools_0.2-16
[41] htmltools_0.5.0  ellipsis_0.3.1   assertthat_0.2.1 httpuv_1.5.3.1  
[45] utf8_1.1.4       stringi_1.5.3    crayon_1.3.4