Skip to content
Snippets Groups Projects

rblast

  • Clone with SSH
  • Clone with HTTPS
  • Embed
  • Share
    The snippet can be accessed without any authentication.
    Authored by mbruno

    Download midori2 database and run blast on R Retrieve top 1 taxonomic assignments

    Edited
    get_blast_top1.R 3.18 KiB
    #
    # Script Name: get_blast_top1.R
    # Purpose of script: Retrieve the top1 taxonomic assignments obtained with blastn for each query
    # Author: Morgane Bruno
    # Contact: morgane.bruno@cefe.cnrs.fr
    # Licence: MIT
    #
    
    #_libs
    library(tidyverse)
    
    #_input
    ## MIDORI2 "https://www.reference-midori.info/"
    dl_taxo_midori <- TRUE
    gb_version_date <- "263_2024-10-13" #required if dl_taxo_midori is TRUE
    subdatabase <- "srRNA" #required if dl_taxo_midori is TRUE
    ## Taxonomy
    taxo_file <- here::here("database", "taxonomy.csv")
    ## BLAST
    blast_file <- here::here("blast.out")
    ## TOP 1
    top1_output <- here::here("top1.csv")
    
    
    #_function ----
    download_midori_taxo <- function(
        midori_url = "https://www.reference-midori.info/download/Databases/",
        midori_gb_v_date,
        midori_subdatabase,
        out_taxo,
        dl_method = "curl",
        midori_subdatabase_name = c(paste0("A", c(6,8)), paste0("CO", 1:3), paste0("ND", 1:6),
                                    paste0(c("lr", "sr"), "RNA"), "Cytb", "ND4L")
    ) {
    
      if (!midori_subdatabase %in% midori_subdatabase_name){
        stop(paste(midori_subdatabase, " doesn't exist"))
      }
    
      midori_gb_v <- stringr::word(string = midori_gb_v_date, 1, 1, sep = "_")
      dl_url <- paste0(midori_url, "GenBank", midori_gb_v_date,
                       "/QIIME_sp/uniq/MIDORI2_UNIQ_NUC_SP_GB", midori_gb_v, "_",
                       midori_subdatabase, "_QIIME.taxon.gz")
    
    
      dir.create(path = dirname(out_taxo), showWarnings = FALSE)
      taxo <- here(dirname(out_taxo), "midori_taxonomy_raw.tsv")
      if (!file.exists(taxo)) {
        taxo_gz <- paste0(taxo, ".gz")
        download.file(url = dl_url, destfile = taxo_gz, method = dl_method)
        system(command = paste("gunzip", taxo_gz))
      }
      message("Reformate taxonomy file")
      taxo_raw <- readr::read_delim(file = taxo, delim = ";", col_names = FALSE,
                                col_select = c(1,6,7), show_col_types = FALSE) |>
        setNames(c("header", "genus", "species")) |>
        dplyr::mutate(
          id = stringr::word(string = header, start = 1 , end = 1, sep = "\t"),
          genus = stringr::str_extract(string = genus, pattern = "(?<=g__)(.*)(?=_[0-9]*)"),
          species = stringr::str_extract(string = species, pattern = "(?<=s__)(.*)(?=_[0-9]*)"),
        ) |>
        dplyr::select(-header)
      readr::write_csv(x = taxo_raw, file = out_taxo)
    }
    
    
    #_main ----
    ## Download midori2 taxonomy file  ----
    if (dl_taxo_midori) {
      download_midori_taxo (midori_gb_v_date = gb_version_date,
                            midori_subdatabase = subdatabase,
                            out_taxo = taxo_file)
    }
    
    ## BLAST TOP 1 ----
    match <- readr::read_csv(file = blast_file, show_col_types = FALSE, col_select = 1:3)
    taxo_tb <- readr::read_csv(file = taxo_file)
    
    blast_top1 <- match |>
      setNames(c("query", "blast_match", "blast_identity")) |>
      dplyr::left_join(y = taxo_tb, by = join_by(blast_match == id)) |>
      dplyr::group_by(query) |>
      dplyr::top_n(n = 1, wt = blast_identity) |>
      dplyr::ungroup() |>
      dplyr::summarise(
        blast_top1_genus = unique(genus) |> purrr::discard(is.na) |> paste0(collapse = ", "),
        blast_top1_species = unique(species) |> purrr::discard(is.na) |> paste0(collapse = ", "),
        blast_top1_identity = unique(blast_identity),
        .by = query
      )
    readr::write_csv(x = blast_top1, file = top1_output)
    rblast.R 2.50 KiB
    #
    # Script Name: rblast.R
    # Purpose of script: Run Blast
    # Author: Morgane Bruno
    # Contact: morgane.bruno@cefe.cnrs.fr
    # Licence: MIT
    # Requirements: BLAST software (installation: https://github.com/mhahsler/rBLAST/blob/devel/INSTALL)
    #
    
    #_libs
    if (!require("rBLAST", quietly = TRUE)) {
      BiocManager::install("rBLAST")
    }
    if (!require("Biostrings", quietly = TRUE)) {
      BiocManager::install("Biostrings")
    }
    
    library(stringr)
    library(readr)
    library(here)
    library(rBLAST)
    library(Biostrings)
    
    #_input ----
    ## MIDORI2 "https://www.reference-midori.info/"
    dl_midori_db <- TRUE
    gb_version_date <- "263_2024-10-13" #required if dl_midori_db is TRUE
    subdatabase <- "srRNA" #required if dl_midori_db is TRUE
    ## Database
    db_fasta <- here::here("database", "database.fasta")
    ## Query
    query_fasta <- here::here("data", "query_sequences.fasta")
    ## BLAST
    min_cov <- 80
    min_identity <- 60
    blast_output <- here::here("blast.out")
    
    #_function ----
    download_midori_fasta <- function(
        midori_url = "https://www.reference-midori.info/download/Databases/",
        midori_gb_v_date,
        midori_subdatabase,
        out_fa,
        dl_method = "curl",
        midori_subdatabase_name = c(paste0("A", c(6,8)), paste0("CO", 1:3), paste0("ND", 1:6), 
                                    paste0(c("lr", "sr"), "RNA"), "Cytb", "ND4L")
        ) {
      
      if (!midori_subdatabase %in% midori_subdatabase_name){
        stop(paste(midori_subdatabase, " doesn't exist"))
      }
      
      midori_gb_v <- stringr::word(string = midori_gb_v_date, 1, 1, sep = "_")
      dl_url <- paste0(midori_url, "GenBank", midori_gb_v_date, 
                    "/RAW/total/MIDORI2_TOTAL_NUC_GB", midori_gb_v, "_", 
                    midori_subdatabase, "_RAW.fasta.gz")
      
      
      dir.create(path = dirname(out_fa), showWarnings = FALSE)
      if (!file.exists(out_fa)) {
        outfa_gz <- paste0(out_fa, ".gz")
        download.file(url = dl_url, destfile = outfa_gz, method = dl_method)
        system(command = paste("gunzip", outfa_gz))
      }
    }
    
    #_main ----
    ## Download midori2 fasta file (genbank subdatabase) ----
    if (dl_midori_db) {
      download_midori_fasta(midori_gb_v_date = gb_version_date,
                            midori_subdatabase = subdatabase,
                            out_fa = db_fasta)
    }
    
    ## Make BLAST database ----
    rBLAST::makeblastdb(file = db_fasta, dbtype = "nucl")
    ## Run BLASTN ----
    bl <- rBLAST::blast(db = db_fasta)
    query_seq <- Biostrings::readDNAStringSet(filepath = query_fasta)
    blast_res <- predict(bl, query_seq, BLAST_args = paste("-qcov_hsp_perc", min_cov, "-perc_identity", min_identity))
    readr::write_csv(x = blast_res, file = blast_output)
    0% Loading or .
    You are about to add 0 people to the discussion. Proceed with caution.
    Finish editing this message first!
    Please register or to comment