#
# Script Name: rblast.R
# Purpose of script: Run Blast
# Author: Morgane Bruno
# Contact: morgane.bruno@cefe.cnrs.fr
# Licence: MIT
# Requirements: BLAST software (installation: https://github.com/mhahsler/rBLAST/blob/devel/INSTALL)
#

#_libs
if (!require("rBLAST", quietly = TRUE)) {
  BiocManager::install("rBLAST")
}
if (!require("Biostrings", quietly = TRUE)) {
  BiocManager::install("Biostrings")
}

library(stringr)
library(readr)
library(here)
library(rBLAST)
library(Biostrings)

#_input ----
## MIDORI2 "https://www.reference-midori.info/"
dl_midori_db <- TRUE
gb_version_date <- "263_2024-10-13" #required if dl_midori_db is TRUE
subdatabase <- "srRNA" #required if dl_midori_db is TRUE
## Database
db_fasta <- here::here("database", "database.fasta")
## Query
query_fasta <- here::here("data", "query_sequences.fasta")
## BLAST
min_cov <- 80
min_identity <- 60
blast_output <- here::here("blast.out")

#_function ----
download_midori_fasta <- function(
    midori_url = "https://www.reference-midori.info/download/Databases/",
    midori_gb_v_date,
    midori_subdatabase,
    out_fa,
    dl_method = "curl",
    midori_subdatabase_name = c(paste0("A", c(6,8)), paste0("CO", 1:3), paste0("ND", 1:6), 
                                paste0(c("lr", "sr"), "RNA"), "Cytb", "ND4L")
    ) {
  
  if (!midori_subdatabase %in% midori_subdatabase_name){
    stop(paste(midori_subdatabase, " doesn't exist"))
  }
  
  midori_gb_v <- stringr::word(string = midori_gb_v_date, 1, 1, sep = "_")
  dl_url <- paste0(midori_url, "GenBank", midori_gb_v_date, 
                "/RAW/total/MIDORI2_TOTAL_NUC_GB", midori_gb_v, "_", 
                midori_subdatabase, "_RAW.fasta.gz")
  
  
  dir.create(path = dirname(out_fa), showWarnings = FALSE)
  if (!file.exists(out_fa)) {
    outfa_gz <- paste0(out_fa, ".gz")
    download.file(url = dl_url, destfile = outfa_gz, method = dl_method)
    system(command = paste("gunzip", outfa_gz))
  }
}

#_main ----
## Download midori2 fasta file (genbank subdatabase) ----
if (dl_midori_db) {
  download_midori_fasta(midori_gb_v_date = gb_version_date,
                        midori_subdatabase = subdatabase,
                        out_fa = db_fasta)
}

## Make BLAST database ----
rBLAST::makeblastdb(file = db_fasta, dbtype = "nucl")
## Run BLASTN ----
bl <- rBLAST::blast(db = db_fasta)
query_seq <- Biostrings::readDNAStringSet(filepath = query_fasta)
blast_res <- predict(bl, query_seq, BLAST_args = paste("-qcov_hsp_perc", min_cov, "-perc_identity", min_identity))
readr::write_csv(x = blast_res, file = blast_output)