diff --git a/src/bin/aphid/main.rs b/src/bin/aphid/main.rs index d03727778af5d2a8e2eb118fb6c792a8643fc4ad..f3309c0aa165d7d80fca121ef1201ee21c2cfb92 100644 --- a/src/bin/aphid/main.rs +++ b/src/bin/aphid/main.rs @@ -64,7 +64,8 @@ fn run() -> Result<(), Error> { // Output full configuration detail. write_config(&output, &config, interner)?; - // Prepare collection of per-tree detailed data. + // Prepare output. + let mut global = init_global(&forest); let mut details = init_detail(&forest, interner); // Remove all non-designated species from the gene trees. @@ -74,11 +75,12 @@ fn run() -> Result<(), Error> { // Use this pass to calculate branches lengths means // and prepare the triplet information for later likelihood calculation. let (topology_included, (triplet_means_sum, rest_means_sum), local_triplets) = - topology_filter(&pruned, &mut details, &config, interner); + topology_filter(&pruned, &mut global, &mut details, &config, interner); // Geometry-filtering pass. let included = geometry_filter( &pruned, + &mut global, &mut details, topology_included, (triplet_means_sum, rest_means_sum), @@ -88,14 +90,16 @@ fn run() -> Result<(), Error> { display_summary(&included, &details, &config); // Extract the only information needed for likelihood calculation. - let triplets = final_tree_selection(included, local_triplets, &mut details); + let triplets = final_tree_selection(included, local_triplets, &mut global, &mut details); // Output full per-tree detail. write_detail(&output, &details)?; write_csv(&output, &details)?; // Fit the model. - learn(&triplets, &config)?; + learn(&triplets, &mut global, &config)?; + + write_global(&output, &global)?; Ok(()) } @@ -182,6 +186,13 @@ fn write_config(output: &Path, config: &Config, interner: &Interner) -> Result<( Ok(()) } +fn init_global<'i>(forest: &GenesForest) -> output::Global { + output::Global { + n_trees: forest.len(), + ..Default::default() // Fill further information later. + } +} + fn init_detail<'i>(forest: &GenesForest, interner: &'i Interner) -> Vec<detail::Tree<'i>> { forest .ids() @@ -216,6 +227,7 @@ fn prune( fn topology_filter<'i>( pruned: &[GeneTree], + global: &mut output::Global, details: &mut [detail::Tree<'i>], config: &Config, interner: &'i Interner, @@ -238,8 +250,12 @@ fn topology_filter<'i>( // Decide first filter based on topology. let topology = tree.topological_analysis(taxa); - // Collect detail. + // Collect detail, accumulate global. let (triplet, outgroup, treetop) = detail::resolve_topology(&topology, taxa, tom, interner); + global.n_excluded_triplets_topologies += usize::from(!triplet.included); + global.n_excluded_outgroup_topologies += usize::from(!outgroup.included); + global.n_excluded_treetops_topologies += + usize::from(!treetop.as_ref().map(|t| t.included).unwrap_or(true)); detail.triplet = triplet; detail.outgroup = outgroup; detail.top = treetop; @@ -276,6 +292,7 @@ fn topology_filter<'i>( " <g>{n_excluded}</> tree{s_were} excluded from analysis based on their topology.", s_were = if n_excluded > 1 { "s were" } else { " was" }, ); + if let Some(min) = unrl { cprintln!( " <g>{n_unresolved}</> included triplet{s_were} unresolved (internal length ⩽ {min}).", @@ -283,6 +300,12 @@ fn topology_filter<'i>( ); } + // Record global information. + global.n_excluded_topologies = n_excluded; + global.n_unresolved_triplets = n_unresolved; + global.mean_length_triplet = triplet_means_sum / global.n_trees as f64; + global.mean_length_outgroup_other = rest_means_sum / global.n_trees as f64; + ( included, (triplet_means_sum, rest_means_sum), @@ -292,6 +315,7 @@ fn topology_filter<'i>( fn geometry_filter( pruned: &[GeneTree], + global: &mut output::Global, details: &mut [detail::Tree], topology_included: Vec<usize>, (triplet_means_sum, rest_means_sum): (f64, f64), @@ -314,6 +338,10 @@ fn geometry_filter( let global_shape = triplet_means_sum / rest_means_sum; cprintln!(" Mean tree shape: <g>{global_shape}</>"); + global.imbalance = Some(imbalance); + global.triplet_longer = Some(triplet_longer); + global.shape = Some(global_shape); + for i in topology_included { let tree = &pruned[i]; let detail = &mut details[i]; @@ -352,12 +380,16 @@ fn geometry_filter( println!(" --> All trees kept."); } + global.n_excluded_geometries = n_excluded; + global.n_included_trees = geometry_included.len(); + geometry_included } fn final_tree_selection( included: Vec<usize>, mut local_triplets: Vec<Option<LocalGeneTriplet>>, + global: &mut output::Global, details: &mut [detail::Tree], ) -> Vec<GeneTriplet> { println!("Estimate relative mutation rates."); @@ -365,8 +397,9 @@ fn final_tree_selection( .iter() .map(|&i| details[i].mean_lengths.total.unwrap()) .summed_mean::<usize>(); - cprintln!(" mean branch length accross trees: l = <g>{global_mean}</>"); + global.mean_branch_length = global_mean; + let mut triplets = Vec::new(); for i in included { let local = local_triplets[i].take().unwrap(); // Exists since included. @@ -467,7 +500,11 @@ fn display_summary(included: &[usize], details: &[detail::Tree], config: &Config cprintln!("==> <s,g>{n}</> tree{} kept for analysis.", s(n)); } -fn learn(triplets: &[GeneTriplet], config: &Config) -> Result<(), Error> { +fn learn( + triplets: &[GeneTriplet], + global: &mut output::Global, + config: &Config, +) -> Result<(), Error> { println!("\nLearning:"); let parms = &config.search.init_parms; @@ -485,6 +522,21 @@ fn learn(triplets: &[GeneTriplet], config: &Config) -> Result<(), Error> { regular: {r}\n\ tensors: {t}\n" ); + + global.estimate.ln_likelihood = opt_lnl; + global.estimate.parameters = opt; + + Ok(()) +} + +fn write_global(output: &Path, global: &output::Global) -> Result<(), Error> { + let path = output.join(file::GLOBAL); + cprintln!( + "Write global analysis results detail to <b>{}</>.", + path.to_relative()?.display() + ); + let mut file = File::create(path)?; + writeln!(file, "{:#}", json!(global))?; Ok(()) } diff --git a/src/model/parameters.rs b/src/model/parameters.rs index ae55e8f59c477120dd1183f456dd22f360d0fb50..81792b8a8761d927e029b7f9de59600296def71f 100644 --- a/src/model/parameters.rs +++ b/src/model/parameters.rs @@ -25,7 +25,7 @@ impl Num for ValGrad {} // - tau_1 <= tau_2 // - p_* <= 1 // - p_ab + p_ac + p_bc <= 1 -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Default)] pub struct Parameters<F: Num> { // Population size. pub(crate) theta: F, @@ -43,7 +43,7 @@ pub struct Parameters<F: Num> { // Parameters are stack-allocated, so don't overuse possible GF times. pub(crate) const MAX_N_GF_TIMES: usize = 3; -#[derive(Serialize, Debug, Clone, PartialEq, Eq)] +#[derive(Serialize, Debug, Clone, PartialEq, Eq, Default)] #[serde(transparent)] pub struct GeneFlowTimes<F: Num>(pub(crate) ArrayVec<F, MAX_N_GF_TIMES>); diff --git a/src/output.rs b/src/output.rs index 42391064f1d4bded5470c8a102bfb91a18268cf6..7b7973f9933655ae4acbabf89e06f021d50d228f 100644 --- a/src/output.rs +++ b/src/output.rs @@ -3,12 +3,16 @@ mod config; pub mod csv; pub mod detail; +pub mod global; // Standard output filenames. pub mod file { pub const CONFIG: &str = "config.json"; + pub const GLOBAL: &str = "global.json"; pub const DETAIL: &str = "detail.json"; pub const CSV: &str = "trees.csv"; pub const MAIN_TRACE: &str = "search_global.csv"; pub const LINSEARCH_TRACE: &str = "search_detail.csv"; } + +pub use global::Global; diff --git a/src/output/detail.rs b/src/output/detail.rs index 412ac6ef157aeb0b7adef3317a2a3d59db9ae056..b5feebdb167d5d047e870bfe82d4e31aa3d131e1 100644 --- a/src/output/detail.rs +++ b/src/output/detail.rs @@ -13,7 +13,7 @@ use crate::{ BranchLength, LocalGeneTriplet, TopologicalAnalysis, }; -/// All information used or produced by aphid +/// All user-facing information used or produced by aphid /// regarding one particular gene tree. #[derive(Serialize, Default)] pub struct Tree<'i> { diff --git a/src/output/global.rs b/src/output/global.rs new file mode 100644 index 0000000000000000000000000000000000000000..c0975a20b33bcb4184798d9e37e52a7e0d53bfaf --- /dev/null +++ b/src/output/global.rs @@ -0,0 +1,75 @@ +// Use this structure to collect all analysis/run/top-level information +// susceptible to interest user after an aphid run. +// Export as a json file. + +use serde::Serialize; + +use crate::{BranchLength, Parameters}; + +/// All user-facing information used or produced by aphid +/// regarding the gene forest analyzed. +#[derive(Serialize, Default)] +pub struct Global { + /// The number of trees analyzed. + pub n_trees: usize, + + /// Number of triplets rejected based on their topology + /// (incomplete or paraphyletic). + pub n_excluded_triplets_topologies: usize, + + /// Number of triplets considered unresolved. + pub n_unresolved_triplets: usize, + + /// Number of outgroup rejected based on their topology + /// (empty or paraphyletic). + pub n_excluded_outgroup_topologies: usize, + + /// Number of tree tops rejected + /// (non-root LCA(triplet, outgroup)). + /// Trees already rejected based on their triplet or outgroup are not counted. + pub n_excluded_treetops_topologies: usize, + + /// Total number of trees excluded based on their topology. + pub n_excluded_topologies: usize, + + /// Mean branch length over the forest. + pub mean_branch_length: BranchLength, + + /// Mean length of a triplet, + /// calculated after the topology filter and before the geometry filter. + pub mean_length_triplet: BranchLength, + + /// Mean length of non-triplet branches ('other' and 'outgroup') + /// calculated after the topology filter and before the geometry filter. + pub mean_length_outgroup_other: BranchLength, + + /// Average imbalance between triplet branches lengths + /// and the outgroup + other sections branches lengths. + /// Only calculated if a maximum clock ratio is set. + pub imbalance: Option<f64>, + + /// True if the imbalance means that triplets are longer on average. + /// False if the outgroup + other sections are longer on average. + pub triplet_longer: Option<bool>, + + /// Overall ratio over the genes forest. + pub shape: Option<f64>, + + /// Number of trees excluded based on their geometry. + pub n_excluded_geometries: usize, + + /// Final number of trees kept for likelihood calculation. + pub n_included_trees: usize, + + /// Best parameters values found to maximize likelihood. + pub estimate: Estimate, +} + +#[derive(Serialize, Default)] +pub struct Estimate { + /// Best ln-likelihood value found. + pub ln_likelihood: f64, + + /// Corresponding parameters values. + pub parameters: Parameters<f64>, +}