reformat_spygen_data.R

#
# Script Name: reformat_spygen_data.R
# Purpose of script: Reformat and merge SPYGEN excel files
# Author: Morgane Bruno
# Contact: morgane.bruno@cefe.cnrs.fr
# Licence: MIT
#

#_input ----

xl_files <- list.files(path = here::here("data"), full.names = TRUE)
names(xl_files) <- basename(xl_files)
output_file <- "2025_Pleo_all_results.csv"

#_fun ----

#' format_spygen_file
#'
#' @param excel_file SPYGEN excel file
#' @return A tibble with taxa as columns and samples as rows
#' @examples
#' format_spygen_file("spygen_results.xlsx")
format_spygen_file <- function(excel_file){
  message(excel_file)
  xl <- readxl::read_xlsx(path = excel_file, trim_ws = TRUE)
  colnames(xl) <- colnames(xl) |> toupper()
  # Create a correspondence table between sequencing run names and samples.
  # Prevents duplicates if a sample has been sequenced several times and,
  # allows downstream filtering, if necessary, of results corresponding to resequencing.
  corr_run_sample <- xl |>
    dplyr::filter(PROJECT == "Sample") |>
    dplyr::select(-(1:5)) |>
    tidyr::pivot_longer(cols = dplyr::everything(), names_to = "run", values_to = "sample") |>
    dplyr::mutate(run = stringr::str_remove(string = run, pattern = "\\.\\.\\.[0-9]+")) |>
    tidyr::drop_na()
  # torm = pattern used to remove unused metadata row, case and separator doesn't matter
  torm <- c("Sampling date", "Site name", "Site code", "class", "region", "country", "date") |>
    snakecase::to_snake_case()
  # Transpose excel table: inverts scientific_name and spygen_code
  xl |>
    dplyr::mutate(PROJECT = snakecase::to_snake_case(PROJECT)) |>
    dplyr::filter(!PROJECT %in% torm) |>
    janitor::row_to_names(row_number = 1) |>
    dplyr::select(scientific_name = 4, dplyr::starts_with("SPY")) |>
    dplyr::mutate(scientific_name = stringr::str_replace_all(string = scientific_name, pattern = " ", replacement = "_")) |>
    tidyr::pivot_longer(cols = -scientific_name, names_to = "spygen_code", values_to = "N_reads") |>
    dplyr::mutate(N_reads = as.numeric(N_reads)) |>
    tidyr::drop_na() |>
    tidyr::pivot_wider(names_from = "scientific_name", values_from = "N_reads", values_fill = 0) |>
    dplyr::left_join(y = corr_run_sample, by = dplyr::join_by(spygen_code == sample)) |>
    dplyr::relocate(run, .before = spygen_code)
}

#_main ----

all_data <- purrr::map(.x = xl_files, .f = format_spygen_file) |>
  dplyr::bind_rows(.id = "") |>
  dplyr::mutate_if(is.numeric, ~replace(., is.na(.), 0)) |>
  dplyr::distinct() |>
  dplyr::arrange(spygen_code)

readr::write_csv(x = all_data, file = output_file)