From 1ab7383e751d79c4c34df6d8eb1b94efd3a1d7b9 Mon Sep 17 00:00:00 2001
From: Iago Bonnici <iago.bonnici@umontpellier.fr>
Date: Fri, 6 Sep 2024 19:44:03 +0200
Subject: [PATCH] Export scalar tree data as CSV.

---
 Cargo.toml            |   1 +
 src/bin/aphid/main.rs |  27 +++++--
 src/output.rs         |   1 +
 src/output/csv.rs     | 169 ++++++++++++++++++++++++++++++++++++++++++
 src/output/detail.rs  |   4 +-
 5 files changed, 195 insertions(+), 7 deletions(-)
 create mode 100644 src/output/csv.rs

diff --git a/Cargo.toml b/Cargo.toml
index c256cf0..692d900 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -29,6 +29,7 @@ snafu = "0.8.2"
 serde_json = { version = "1.0.127", features = ["preserve_order"] }
 color-print = { git = "https://gitlab.com/iago-lito/color-print", branch = "dev", version = "0.3.6" }
 color-print-proc-macro = { git = "https://gitlab.com/iago-lito/color-print", branch = "dev", version = "0.3.6" }
+csv = "1.3.0"
 
 [dev-dependencies]
 rand = "0.8.5"
diff --git a/src/bin/aphid/main.rs b/src/bin/aphid/main.rs
index bbeb082..617b7af 100644
--- a/src/bin/aphid/main.rs
+++ b/src/bin/aphid/main.rs
@@ -11,14 +11,14 @@ use aphid::{
     interner::{Interner, ResolvedSymbol, SpeciesSymbol},
     it_mean::SummedMean,
     ln_likelihood, optimize_likelihood,
-    output::detail,
+    output::{self, detail},
     Config, GeneTree, GeneTriplet, GenesForest, LocalGeneTriplet, VERSION,
 };
 use clap::Parser;
 use color_print::{ceprintln, cformat, cprintln};
 use float_eq::float_eq;
 use serde_json::json;
-use snafu::{ensure, Snafu};
+use snafu::{ensure, ResultExt, Snafu};
 
 mod args;
 
@@ -26,6 +26,7 @@ mod args;
 mod out {
     pub(super) const CONFIG: &str = "config.json";
     pub(super) const DETAIL: &str = "detail.json";
+    pub(super) const CSV: &str = "trees.csv";
 }
 
 // Run and terminate error bubbling if any.
@@ -91,6 +92,7 @@ fn run() -> Result<(), Error> {
 
     // Output full per-tree detail.
     write_detail(&output, &details)?;
+    write_csv(&output, &details)?;
 
     // Fit the model.
     learn(&triplets, &config)?;
@@ -248,7 +250,7 @@ fn topology_filter<'i>(
             triplet_means_sum += mean_triplet;
             rest_means_sum += mean_rest;
 
-            detail.included_topology = true;
+            detail.topology_included = true;
             detail.mean_lengths = detail::MeanLengths {
                 total: Some(mean_length),
                 triplet: Some(mean_triplet),
@@ -323,7 +325,7 @@ fn geometry_filter(
                 tree.is_shape_included(mean_triplet, mean_rest, global_shape, max);
             detail.local_shape = Some(shape_ratio);
             if pass {
-                detail.included_geometry = pass;
+                detail.geometry_included = pass;
                 included.push(i);
             } else {
                 n_excluded += 1;
@@ -384,6 +386,19 @@ fn write_detail(output: &Path, details: &[detail::Tree]) -> Result<(), Error> {
     Ok(())
 }
 
+fn write_csv(output: &Path, details: &[detail::Tree]) -> Result<(), Error> {
+    // One CSV row per tree detail, written next to the other output files.
+    let path = output.join(out::CSV);
+    cprintln!("Summarize scalar values to <b>{}</>.", path.display());
+    let mut writer = csv::Writer::from_writer(File::create(path)?);
+    details
+        .iter()
+        .map(output::csv::Record::from)
+        .try_for_each(|record| writer.serialize(record).context(CsvErr))?;
+    writer.flush()?;
+    Ok(())
+}
+
 fn display_summary(included: &[usize], details: &[detail::Tree], config: &Config) {
     println!("\nSummary:");
 
@@ -431,7 +446,7 @@ fn display_summary(included: &[usize], details: &[detail::Tree], config: &Config
     if let Some(max) = config.filters.max_clock_ratio {
         let n = details
             .iter()
-            .map(|d| u64::from(!d.included_geometry))
+            .map(|d| u64::from(!d.geometry_included))
             .sum();
         cprintln!(
             "  - <g>{n}</> tree shape{} rejected \
@@ -483,4 +498,6 @@ enum Error {
     InputConsistency { mess: String },
     #[snafu(transparent)]
     Learn { source: aphid::learn::Error },
+    #[snafu(display("Could not serialize record to CSV:\n{source}"))]
+    Csv { source: csv::Error },
 }
diff --git a/src/output.rs b/src/output.rs
index 585d6fa..87533ae 100644
--- a/src/output.rs
+++ b/src/output.rs
@@ -2,3 +2,4 @@
 
 mod config;
 pub mod detail;
+pub mod csv;
diff --git a/src/output/csv.rs b/src/output/csv.rs
new file mode 100644
index 0000000..ef2e012
--- /dev/null
+++ b/src/output/csv.rs
@@ -0,0 +1,169 @@
+//! Extract all scalar data from the detailed output
+//! and fit it into a large CSV table to ease parsing by biologists.
+
+use serde::Serialize;
+
+use super::detail::{Internal, MeanLengths, Outgroup, Top, Tree, Triplet, TripletAnalysis};
+use crate::{
+    gene_tree::{NbBases, TripletTopology},
+    BranchLength,
+};
+
+/// One CSV row per tree: the scalar content of a `detail::Tree`, flattened (see `From` below).
+#[derive(Serialize)]
+#[allow(clippy::struct_excessive_bools)] // On purpose for CSV splatting.
+pub struct Record<'i> {
+    // Global tree information, copied unchanged from the detail.
+    tree_id: &'i str,
+    n_bases: NbBases,
+    n_nodes_raw: usize,
+    n_nodes_pruned: usize,
+
+    //----------------------------------------------------------------------------------------------
+    // Topology-related data.
+
+    // Triplet section.
+    triplet_n_missing: usize, // `triplet.missing.len()`
+    triplet_lca: Option<usize>,
+    triplet_n_paraphyletic: usize, // `triplet.paraphyletic.len()`
+    triplet_included: bool,
+    triplet_topology: Option<TripletTopology>, // `None` without a triplet analysis.
+    triplet_branch_length_a: Option<NbBases>, // The four lengths: `None` without analysis.
+    triplet_branch_length_b: Option<NbBases>,
+    triplet_branch_length_c: Option<NbBases>,
+    triplet_branch_length_d: Option<NbBases>,
+    triplet_resolved: Option<bool>, // `None` without a triplet analysis.
+
+    // Outgroup section.
+    outgroup_n_missing: usize, // `outgroup.missing.len()`
+    outgroup_lca: Option<usize>,
+    outgroup_n_paraphyletic: usize, // `outgroup.paraphyletic.len()`
+    outgroup_included: bool,
+
+    // Tree top section: every column is `None` when the detail has no `top`.
+    top_lca: Option<usize>,
+    top_n_external: Option<usize>, // `top.external.len()`
+    top_n_internal_triplet: Option<usize>, // Also `None` when `top.internal` is `None`.
+    top_n_internal_outgroup: Option<usize>, // Also `None` when `top.internal` is `None`.
+    top_included: Option<bool>,
+
+    topology_included: bool, // Raised if the tree passed the topology filter.
+
+    //----------------------------------------------------------------------------------------------
+    // Geometry-related data: the three `MeanLengths` fields, then shape and filter outcome.
+    mean_lengths_triplet: Option<BranchLength>,
+    mean_lengths_outgroup_other: Option<BranchLength>,
+    mean_lengths_total: Option<BranchLength>,
+
+    local_shape: Option<BranchLength>,
+    geometry_included: bool, // Raised if the tree passed the geometry filter.
+
+    //----------------------------------------------------------------------------------------------
+    mutation_rate: Option<f64>, // No estimate if the tree was excluded from analysis.
+}
+
+//==================================================================================================
+// Trivial extraction / flattening from the detailed tree information.
+
+impl<'i> From<&Tree<'i>> for Record<'i> {
+    fn from(tree: &Tree<'i>) -> Self {
+        // Destructure every field (no `..`), so adding one to `Tree` is a compile error here.
+        let &Tree {
+            id: tree_id,
+            n_bases,
+            n_nodes_raw,
+            n_nodes_pruned,
+            triplet:
+                Triplet {
+                    lca: triplet_lca,
+                    missing: ref triplet_missing,
+                    paraphyletic: ref triplet_paraphyletic,
+                    analysis: ref triplet_analysis,
+                    included: triplet_included,
+                },
+            outgroup:
+                Outgroup {
+                    lca: outgroup_lca,
+                    missing: ref outgroup_missing,
+                    paraphyletic: ref outgroup_paraphyletic,
+                    included: outgroup_included,
+                },
+            ref top,
+
+            topology_included,
+            mean_lengths:
+                MeanLengths {
+                    total: mean_lengths_total,
+                    triplet: mean_lengths_triplet,
+                    outgroup_other: mean_lengths_outgroup_other,
+                },
+            local_shape,
+            geometry_included,
+            mutation_rate,
+        } = tree;
+
+        // Split the optional triplet analysis into three options, all `None` when it is absent.
+        let (triplet_topology, tl, triplet_resolved) =
+            if let Some(TripletAnalysis { topology, branches_lengths: ref bl, resolved }) =
+                *triplet_analysis
+            {
+                (Some(topology), Some(bl), Some(resolved))
+            } else {
+                (None, None, None)
+            };
+
+        // Same for the optional tree top; `int` borrows its (itself optional) `internal` part.
+        let (top_lca, top_n_external, int, top_included) =
+            if let Some(Top { lca, ref external, ref internal, included }) = *top {
+                (
+                    Some(lca),
+                    Some(external.len()),
+                    Some(internal),
+                    Some(included),
+                )
+            } else {
+                (None, None, None, None)
+            };
+        // Both counts are `None` if either the top or its `internal` part is missing.
+        let (top_n_internal_triplet, top_n_internal_outgroup) =
+            if let Some(Some(Internal { triplet, outgroup })) = int {
+                (Some(triplet.len()), Some(outgroup.len()))
+            } else {
+                (None, None)
+            };
+
+        // Flatten: collections become their lengths, nested options become optional columns.
+        Record {
+            tree_id,
+            n_bases,
+            n_nodes_raw,
+            n_nodes_pruned,
+            triplet_lca,
+            triplet_n_missing: triplet_missing.len(),
+            triplet_n_paraphyletic: triplet_paraphyletic.len(),
+            triplet_included,
+            triplet_topology,
+            triplet_branch_length_a: tl.map(|l| l.a),
+            triplet_branch_length_b: tl.map(|l| l.b),
+            triplet_branch_length_c: tl.map(|l| l.c),
+            triplet_branch_length_d: tl.map(|l| l.d),
+            triplet_resolved,
+            outgroup_lca,
+            outgroup_n_missing: outgroup_missing.len(),
+            outgroup_n_paraphyletic: outgroup_paraphyletic.len(),
+            outgroup_included,
+            top_lca,
+            top_n_external,
+            top_n_internal_triplet,
+            top_n_internal_outgroup,
+            top_included,
+            topology_included,
+            mean_lengths_triplet,
+            mean_lengths_outgroup_other,
+            mean_lengths_total,
+            local_shape,
+            geometry_included,
+            mutation_rate,
+        }
+    }
+}
diff --git a/src/output/detail.rs b/src/output/detail.rs
index 0228f7b..412ac6e 100644
--- a/src/output/detail.rs
+++ b/src/output/detail.rs
@@ -39,7 +39,7 @@ pub struct Tree<'i> {
     pub top: Option<Top<'i>>,
 
     /// Raised if the tree passed the topology filter.
-    pub included_topology: bool,
+    pub topology_included: bool,
 
     /// Mean branches lengths,
     /// undefined if the none of the species set
@@ -53,7 +53,7 @@ pub struct Tree<'i> {
     pub local_shape: Option<f64>,
 
     /// Raised if the tree passed the geometry filter.
-    pub included_geometry: bool,
+    pub geometry_included: bool,
 
     /// Estimated mutation rate for this tree.
     /// (No estimate if the tree was excluded from analysis.)
-- 
GitLab