rblast
The snippet can be accessed without any authentication.
Authored by
mbruno
Download midori2 database and run blast on R Retrieve top 1 taxonomic assignments
get_blast_top1.R 3.18 KiB
#
# Script Name: get_blast_top1.R
# Purpose of script: Retrieve the top1 taxonomic assignments obtained with blastn for each query
# Author: Morgane Bruno
# Contact: morgane.bruno@cefe.cnrs.fr
# Licence: MIT
#
#_libs
library(tidyverse)
#_input
## MIDORI2 "https://www.reference-midori.info/"
dl_taxo_midori <- TRUE
gb_version_date <- "263_2024-10-13" #required if dl_taxo_midori is TRUE
subdatabase <- "srRNA" #required if dl_taxo_midori is TRUE
## Taxonomy
taxo_file <- here::here("database", "taxonomy.csv")
## BLAST
blast_file <- here::here("blast.out")
## TOP 1
top1_output <- here::here("top1.csv")
#_function ----
download_midori_taxo <- function(
midori_url = "https://www.reference-midori.info/download/Databases/",
midori_gb_v_date,
midori_subdatabase,
out_taxo,
dl_method = "curl",
midori_subdatabase_name = c(paste0("A", c(6,8)), paste0("CO", 1:3), paste0("ND", 1:6),
paste0(c("lr", "sr"), "RNA"), "Cytb", "ND4L")
) {
if (!midori_subdatabase %in% midori_subdatabase_name){
stop(paste(midori_subdatabase, " doesn't exist"))
}
midori_gb_v <- stringr::word(string = midori_gb_v_date, 1, 1, sep = "_")
dl_url <- paste0(midori_url, "GenBank", midori_gb_v_date,
"/QIIME_sp/uniq/MIDORI2_UNIQ_NUC_SP_GB", midori_gb_v, "_",
midori_subdatabase, "_QIIME.taxon.gz")
dir.create(path = dirname(out_taxo), showWarnings = FALSE)
taxo <- here(dirname(out_taxo), "midori_taxonomy_raw.tsv")
if (!file.exists(taxo)) {
taxo_gz <- paste0(taxo, ".gz")
download.file(url = dl_url, destfile = taxo_gz, method = dl_method)
system(command = paste("gunzip", taxo_gz))
}
message("Reformate taxonomy file")
taxo_raw <- readr::read_delim(file = taxo, delim = ";", col_names = FALSE,
col_select = c(1,6,7), show_col_types = FALSE) |>
setNames(c("header", "genus", "species")) |>
dplyr::mutate(
id = stringr::word(string = header, start = 1 , end = 1, sep = "\t"),
genus = stringr::str_extract(string = genus, pattern = "(?<=g__)(.*)(?=_[0-9]*)"),
species = stringr::str_extract(string = species, pattern = "(?<=s__)(.*)(?=_[0-9]*)"),
) |>
dplyr::select(-header)
readr::write_csv(x = taxo_raw, file = out_taxo)
}
#_main ----
## Download midori2 taxonomy file ----
if (dl_taxo_midori) {
download_midori_taxo (midori_gb_v_date = gb_version_date,
midori_subdatabase = subdatabase,
out_taxo = taxo_file)
}
## BLAST TOP 1 ----
match <- readr::read_csv(file = blast_file, show_col_types = FALSE, col_select = 1:3)
taxo_tb <- readr::read_csv(file = taxo_file)
blast_top1 <- match |>
setNames(c("query", "blast_match", "blast_identity")) |>
dplyr::left_join(y = taxo_tb, by = join_by(blast_match == id)) |>
dplyr::group_by(query) |>
dplyr::top_n(n = 1, wt = blast_identity) |>
dplyr::ungroup() |>
dplyr::summarise(
blast_top1_genus = unique(genus) |> purrr::discard(is.na) |> paste0(collapse = ", "),
blast_top1_species = unique(species) |> purrr::discard(is.na) |> paste0(collapse = ", "),
blast_top1_identity = unique(blast_identity),
.by = query
)
readr::write_csv(x = blast_top1, file = top1_output)
rblast.R 2.50 KiB
#
# Script Name: rblast.R
# Purpose of script: Run Blast
# Author: Morgane Bruno
# Contact: morgane.bruno@cefe.cnrs.fr
# Licence: MIT
# Requirements: BLAST software (installation: https://github.com/mhahsler/rBLAST/blob/devel/INSTALL)
#
#_libs
if (!require("rBLAST", quietly = TRUE)) {
BiocManager::install("rBLAST")
}
if (!require("Biostrings", quietly = TRUE)) {
BiocManager::install("Biostrings")
}
library(stringr)
library(readr)
library(here)
library(rBLAST)
library(Biostrings)
#_input ----
## MIDORI2 "https://www.reference-midori.info/"
dl_midori_db <- TRUE
gb_version_date <- "263_2024-10-13" #required if dl_midori_db is TRUE
subdatabase <- "srRNA" #required if dl_midori_db is TRUE
## Database
db_fasta <- here::here("database", "database.fasta")
## Query
query_fasta <- here::here("data", "query_sequences.fasta")
## BLAST
min_cov <- 80
min_identity <- 60
blast_output <- here::here("blast.out")
#_function ----
download_midori_fasta <- function(
midori_url = "https://www.reference-midori.info/download/Databases/",
midori_gb_v_date,
midori_subdatabase,
out_fa,
dl_method = "curl",
midori_subdatabase_name = c(paste0("A", c(6,8)), paste0("CO", 1:3), paste0("ND", 1:6),
paste0(c("lr", "sr"), "RNA"), "Cytb", "ND4L")
) {
if (!midori_subdatabase %in% midori_subdatabase_name){
stop(paste(midori_subdatabase, " doesn't exist"))
}
midori_gb_v <- stringr::word(string = midori_gb_v_date, 1, 1, sep = "_")
dl_url <- paste0(midori_url, "GenBank", midori_gb_v_date,
"/RAW/total/MIDORI2_TOTAL_NUC_GB", midori_gb_v, "_",
midori_subdatabase, "_RAW.fasta.gz")
dir.create(path = dirname(out_fa), showWarnings = FALSE)
if (!file.exists(out_fa)) {
outfa_gz <- paste0(out_fa, ".gz")
download.file(url = dl_url, destfile = outfa_gz, method = dl_method)
system(command = paste("gunzip", outfa_gz))
}
}
#_main ----
## Download midori2 fasta file (genbank subdatabase) ----
if (dl_midori_db) {
download_midori_fasta(midori_gb_v_date = gb_version_date,
midori_subdatabase = subdatabase,
out_fa = db_fasta)
}
## Make BLAST database ----
rBLAST::makeblastdb(file = db_fasta, dbtype = "nucl")
## Run BLASTN ----
bl <- rBLAST::blast(db = db_fasta)
query_seq <- Biostrings::readDNAStringSet(filepath = query_fasta)
blast_res <- predict(bl, query_seq, BLAST_args = paste("-qcov_hsp_perc", min_cov, "-perc_identity", min_identity))
readr::write_csv(x = blast_res, file = blast_output)
Please register or sign in to comment