From 1ab7383e751d79c4c34df6d8eb1b94efd3a1d7b9 Mon Sep 17 00:00:00 2001 From: Iago Bonnici <iago.bonnici@umontpellier.fr> Date: Fri, 6 Sep 2024 19:44:03 +0200 Subject: [PATCH] Export scalar tree data as CSV. --- Cargo.toml | 1 + src/bin/aphid/main.rs | 27 +++++-- src/output.rs | 1 + src/output/csv.rs | 169 ++++++++++++++++++++++++++++++++++++++++++ src/output/detail.rs | 4 +- 5 files changed, 195 insertions(+), 7 deletions(-) create mode 100644 src/output/csv.rs diff --git a/Cargo.toml b/Cargo.toml index c256cf0..692d900 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ snafu = "0.8.2" serde_json = { version = "1.0.127", features = ["preserve_order"] } color-print = { git = "https://gitlab.com/iago-lito/color-print", branch = "dev", version = "0.3.6" } color-print-proc-macro = { git = "https://gitlab.com/iago-lito/color-print", branch = "dev", version = "0.3.6" } +csv = "1.3.0" [dev-dependencies] rand = "0.8.5" diff --git a/src/bin/aphid/main.rs b/src/bin/aphid/main.rs index bbeb082..617b7af 100644 --- a/src/bin/aphid/main.rs +++ b/src/bin/aphid/main.rs @@ -11,14 +11,14 @@ use aphid::{ interner::{Interner, ResolvedSymbol, SpeciesSymbol}, it_mean::SummedMean, ln_likelihood, optimize_likelihood, - output::detail, + output::{self, detail}, Config, GeneTree, GeneTriplet, GenesForest, LocalGeneTriplet, VERSION, }; use clap::Parser; use color_print::{ceprintln, cformat, cprintln}; use float_eq::float_eq; use serde_json::json; -use snafu::{ensure, Snafu}; +use snafu::{ensure, ResultExt, Snafu}; mod args; @@ -26,6 +26,7 @@ mod args; mod out { pub(super) const CONFIG: &str = "config.json"; pub(super) const DETAIL: &str = "detail.json"; + pub(super) const CSV: &str = "trees.csv"; } // Run and terminate error bubbling if any. @@ -91,6 +92,7 @@ fn run() -> Result<(), Error> { // Output full per-tree detail. write_detail(&output, &details)?; + write_csv(&output, &details)?; // Fit the model. learn(&triplets, &config)?; @@ -248,7 +250,7 @@ fn topology_filter<'i>( triplet_means_sum += mean_triplet; rest_means_sum += mean_rest; - detail.included_topology = true; + detail.topology_included = true; detail.mean_lengths = detail::MeanLengths { total: Some(mean_length), triplet: Some(mean_triplet), @@ -323,7 +325,7 @@ fn geometry_filter( tree.is_shape_included(mean_triplet, mean_rest, global_shape, max); detail.local_shape = Some(shape_ratio); if pass { - detail.included_geometry = pass; + detail.geometry_included = pass; included.push(i); } else { n_excluded += 1; @@ -384,6 +386,19 @@ fn write_detail(output: &Path, details: &[detail::Tree]) -> Result<(), Error> { Ok(()) } +fn write_csv(output: &Path, details: &[detail::Tree]) -> Result<(), Error> { + let path = output.join(out::CSV); + cprintln!("Summarize scalar values to <b>{}</>.", path.display()); + let file = File::create(path)?; + let mut wtr = csv::Writer::from_writer(file); + for detail in details { + let record: output::csv::Record = detail.into(); + wtr.serialize(record).context(CsvErr)?; + } + wtr.flush()?; + Ok(()) +} + fn display_summary(included: &[usize], details: &[detail::Tree], config: &Config) { println!("\nSummary:"); @@ -431,7 +446,7 @@ fn display_summary(included: &[usize], details: &[detail::Tree], config: &Config if let Some(max) = config.filters.max_clock_ratio { let n = details .iter() - .map(|d| u64::from(!d.included_geometry)) + .map(|d| u64::from(!d.geometry_included)) .sum(); cprintln!( " - <g>{n}</> tree shape{} rejected \ @@ -483,4 +498,6 @@ enum Error { InputConsistency { mess: String }, #[snafu(transparent)] Learn { source: aphid::learn::Error }, + #[snafu(display("Could not serialize record to CSV:\n{source}"))] + Csv { source: csv::Error }, } diff --git a/src/output.rs b/src/output.rs index 585d6fa..87533ae 100644 --- a/src/output.rs +++ b/src/output.rs @@ -2,3 +2,4 @@ mod config; pub mod detail; +pub mod csv; diff --git a/src/output/csv.rs b/src/output/csv.rs new file mode 100644 index 0000000..ef2e012 --- /dev/null +++ b/src/output/csv.rs @@ -0,0 +1,169 @@ +// Extract all scalar data from the detailed output +// and fit it into a large csv table to ease parsing by biologists. + +use serde::Serialize; + +use super::detail::{Internal, MeanLengths, Outgroup, Top, Tree, Triplet, TripletAnalysis}; +use crate::{ + gene_tree::{NbBases, TripletTopology}, + BranchLength, +}; + +// One record per tree. +#[derive(Serialize)] +#[allow(clippy::struct_excessive_bools)] // On purpose for CSV splatting. +pub struct Record<'i> { + // Global tree information. + tree_id: &'i str, + n_bases: NbBases, + n_nodes_raw: usize, + n_nodes_pruned: usize, + + //---------------------------------------------------------------------------------------------- + // Topology-related data. + + // Triplet section. + triplet_n_missing: usize, + triplet_lca: Option<usize>, + triplet_n_paraphyletic: usize, + triplet_included: bool, + triplet_topology: Option<TripletTopology>, + triplet_branch_length_a: Option<NbBases>, + triplet_branch_length_b: Option<NbBases>, + triplet_branch_length_c: Option<NbBases>, + triplet_branch_length_d: Option<NbBases>, + triplet_resolved: Option<bool>, + + // Outgroup section. + outgroup_n_missing: usize, + outgroup_lca: Option<usize>, + outgroup_n_paraphyletic: usize, + outgroup_included: bool, + + // Tree top section. + top_lca: Option<usize>, + top_n_external: Option<usize>, + top_n_internal_triplet: Option<usize>, + top_n_internal_outgroup: Option<usize>, + top_included: Option<bool>, + + topology_included: bool, + + //---------------------------------------------------------------------------------------------- + // Geometry-related data. + mean_lengths_triplet: Option<BranchLength>, + mean_lengths_outgroup_other: Option<BranchLength>, + mean_lengths_total: Option<BranchLength>, + + local_shape: Option<BranchLength>, + geometry_included: bool, + + //---------------------------------------------------------------------------------------------- + mutation_rate: Option<f64>, +} + +//================================================================================================== +// Trivial extraction / flattening from the detailed tree information. + +impl<'i> From<&Tree<'i>> for Record<'i> { + fn from(tree: &Tree<'i>) -> Self { + // Destructure all. + let &Tree { + id: tree_id, + n_bases, + n_nodes_raw, + n_nodes_pruned, + triplet: + Triplet { + lca: triplet_lca, + missing: ref triplet_missing, + paraphyletic: ref triplet_paraphyletic, + analysis: ref triplet_analysis, + included: triplet_included, + }, + outgroup: + Outgroup { + lca: outgroup_lca, + missing: ref outgroup_missing, + paraphyletic: ref outgroup_paraphyletic, + included: outgroup_included, + }, + ref top, + + topology_included, + mean_lengths: + MeanLengths { + total: mean_lengths_total, + triplet: mean_lengths_triplet, + outgroup_other: mean_lengths_outgroup_other, + }, + local_shape, + geometry_included, + mutation_rate, + } = tree; + + // Deeper into (optional) triplet information destructuring. + let (triplet_topology, tl, triplet_resolved) = + if let Some(TripletAnalysis { topology, branches_lengths: ref bl, resolved }) = + *triplet_analysis + { + (Some(topology), Some(bl), Some(resolved)) + } else { + (None, None, None) + }; + + // Deeper into (optional) tree top information destructuring. + let (top_lca, top_n_external, int, top_included) = + if let Some(Top { lca, ref external, ref internal, included }) = *top { + ( + Some(lca), + Some(external.len()), + Some(internal), + Some(included), + ) + } else { + (None, None, None, None) + }; + + let (top_n_internal_triplet, top_n_internal_outgroup) = + if let Some(Some(Internal { triplet, outgroup })) = int { + (Some(triplet.len()), Some(outgroup.len())) + } else { + (None, None) + }; + + // All is eventually flattened. + Record { + tree_id, + n_bases, + n_nodes_raw, + n_nodes_pruned, + triplet_lca, + triplet_n_missing: triplet_missing.len(), + triplet_n_paraphyletic: triplet_paraphyletic.len(), + triplet_included, + triplet_topology, + triplet_branch_length_a: tl.map(|l| l.a), + triplet_branch_length_b: tl.map(|l| l.b), + triplet_branch_length_c: tl.map(|l| l.c), + triplet_branch_length_d: tl.map(|l| l.d), + triplet_resolved, + outgroup_lca, + outgroup_n_missing: outgroup_missing.len(), + outgroup_n_paraphyletic: outgroup_paraphyletic.len(), + outgroup_included, + top_lca, + top_n_external, + top_n_internal_triplet, + top_n_internal_outgroup, + top_included, + topology_included, + mean_lengths_triplet, + mean_lengths_outgroup_other, + mean_lengths_total, + local_shape, + geometry_included, + mutation_rate, + } + } +} diff --git a/src/output/detail.rs b/src/output/detail.rs index 0228f7b..412ac6e 100644 --- a/src/output/detail.rs +++ b/src/output/detail.rs @@ -39,7 +39,7 @@ pub struct Tree<'i> { pub top: Option<Top<'i>>, /// Raised if the tree passed the topology filter. - pub included_topology: bool, + pub topology_included: bool, /// Mean branches lengths, /// undefined if the none of the species set @@ -53,7 +53,7 @@ pub struct Tree<'i> { pub local_shape: Option<f64>, /// Raised if the tree passed the geometry filter. - pub included_geometry: bool, + pub geometry_included: bool, /// Estimated mutation rate for this tree. /// (No estimate if the tree was excluded from analysis.) -- GitLab