Reformat and merge SPYGEN excel files
The snippet can be accessed without any authentication.
Authored by
mbruno
Edited
reformat_spygen_data.R 2.52 KiB
#
# Script Name: reformat_spygen_data.R
# Purpose of script: Reformat and merge SPYGEN excel files
# Author: Morgane Bruno
# Contact: morgane.bruno@cefe.cnrs.fr
# Licence: MIT
#
#_input ----
xl_files <- list.files(path = here::here("data"), full.names = TRUE)
names(xl_files) <- basename(xl_files)
output_file <- "2025_Pleo_all_results.csv"
#_fun ----
#' format_spygen_file
#'
#' @param excel_file SPYGEN excel file
#' @return A tibble with taxa as columns and samples as rows
#' @examples
#' format_spygen_file("spygen_results.xlsx")
format_spygen_file <- function(excel_file){
message(excel_file)
xl <- readxl::read_xlsx(path = excel_file, trim_ws = TRUE)
colnames(xl) <- colnames(xl) |> toupper()
# Create a correspondence table between sequencing run names and samples.
# Prevents duplicates if a sample has been sequenced several times and,
# allows downstream filtering, if necessary, of results corresponding to resequencing.
corr_run_sample <- xl |>
dplyr::filter(PROJECT == "Sample") |>
dplyr::select(-(1:5)) |>
tidyr::pivot_longer(cols = dplyr::everything(), names_to = "run", values_to = "sample") |>
dplyr::mutate(run = stringr::str_remove(string = run, pattern = "\\.\\.\\.[0-9]+")) |>
tidyr::drop_na()
# torm = pattern used to remove unused metadata row, case and separator doesn't matter
torm <- c("Sampling date", "Site name", "Site code", "class", "region", "country", "date") |>
snakecase::to_snake_case()
# Transpose excel table: inverts scientific_name and spygen_code
xl |>
dplyr::mutate(PROJECT = snakecase::to_snake_case(PROJECT)) |>
dplyr::filter(!PROJECT %in% torm) |>
janitor::row_to_names(row_number = 1) |>
dplyr::select(scientific_name = 4, dplyr::starts_with("SPY")) |>
dplyr::mutate(scientific_name = stringr::str_replace_all(string = scientific_name, pattern = " ", replacement = "_")) |>
tidyr::pivot_longer(cols = -scientific_name, names_to = "spygen_code", values_to = "N_reads") |>
dplyr::mutate(N_reads = as.numeric(N_reads)) |>
tidyr::drop_na() |>
tidyr::pivot_wider(names_from = "scientific_name", values_from = "N_reads", values_fill = 0) |>
dplyr::left_join(y = corr_run_sample, by = dplyr::join_by(spygen_code == sample)) |>
dplyr::relocate(run, .before = spygen_code)
}
#_main ----
all_data <- purrr::map(.x = xl_files, .f = format_spygen_file) |>
dplyr::bind_rows(.id = "") |>
dplyr::mutate_if(is.numeric, ~replace(., is.na(.), 0)) |>
dplyr::distinct() |>
dplyr::arrange(spygen_code)
readr::write_csv(x = all_data, file = output_file)
Please register or sign in to comment