Skip to content
Snippets Groups Projects

Reformat and merge SPYGEN excel files

  • Clone with SSH
  • Clone with HTTPS
  • Embed
  • Share
    The snippet can be accessed without any authentication.
    Authored by mbruno
    Edited
    reformat_spygen_data.R 2.52 KiB
    #
    # Script Name: reformat_spygen_data.R
    # Purpose of script: Reformat and merge SPYGEN excel files
    # Author: Morgane Bruno
    # Contact: morgane.bruno@cefe.cnrs.fr
    # Licence: MIT
    #
    
    #_input ----
    
    xl_files <- list.files(path = here::here("data"), full.names = TRUE)
    names(xl_files) <- basename(xl_files)
    output_file <- "2025_Pleo_all_results.csv"
    
    #_fun ----
    
    #' format_spygen_file
    #'
    #' @param excel_file SPYGEN excel file
    #' @return A tibble with taxa as columns and samples as rows
    #' @examples
    #' format_spygen_file("spygen_results.xlsx")
    format_spygen_file <- function(excel_file){
      message(excel_file)
      xl <- readxl::read_xlsx(path = excel_file, trim_ws = TRUE)
      colnames(xl) <- colnames(xl) |> toupper()
      # Create a correspondence table between sequencing run names and samples.
      # Prevents duplicates if a sample has been sequenced several times and,
      # allows downstream filtering, if necessary, of results corresponding to resequencing.
      corr_run_sample <- xl |>
        dplyr::filter(PROJECT == "Sample") |>
        dplyr::select(-(1:5)) |>
        tidyr::pivot_longer(cols = dplyr::everything(), names_to = "run", values_to = "sample") |>
        dplyr::mutate(run = stringr::str_remove(string = run, pattern = "\\.\\.\\.[0-9]+")) |>
        tidyr::drop_na()
      # torm = pattern used to remove unused metadata row, case and separator doesn't matter
      torm <- c("Sampling date", "Site name", "Site code", "class", "region", "country", "date") |>
        snakecase::to_snake_case()
      # Transpose excel table: inverts scientific_name and spygen_code
      xl |>
        dplyr::mutate(PROJECT = snakecase::to_snake_case(PROJECT)) |>
        dplyr::filter(!PROJECT %in% torm) |>
        janitor::row_to_names(row_number = 1) |>
        dplyr::select(scientific_name = 4, dplyr::starts_with("SPY")) |>
        dplyr::mutate(scientific_name = stringr::str_replace_all(string = scientific_name, pattern = " ", replacement = "_")) |>
        tidyr::pivot_longer(cols = -scientific_name, names_to = "spygen_code", values_to = "N_reads") |>
        dplyr::mutate(N_reads = as.numeric(N_reads)) |>
        tidyr::drop_na() |>
        tidyr::pivot_wider(names_from = "scientific_name", values_from = "N_reads", values_fill = 0) |>
        dplyr::left_join(y = corr_run_sample, by = dplyr::join_by(spygen_code == sample)) |>
        dplyr::relocate(run, .before = spygen_code)
    }
    
    #_main ----
    
    all_data <- purrr::map(.x = xl_files, .f = format_spygen_file) |>
      dplyr::bind_rows(.id = "") |>
      dplyr::mutate_if(is.numeric, ~replace(., is.na(.), 0)) |>
      dplyr::distinct() |>
      dplyr::arrange(spygen_code)
    
    readr::write_csv(x = all_data, file = output_file)
    0% Loading or .
    You are about to add 0 people to the discussion. Proceed with caution.
    Finish editing this message first!
    Please register or to comment