diff --git a/src/bin/aphid/args.rs b/src/bin/aphid/args.rs new file mode 100644 index 0000000000000000000000000000000000000000..a2998947b24073f2d07bb239023b673b7bbe7922 --- /dev/null +++ b/src/bin/aphid/args.rs @@ -0,0 +1,18 @@ +use std::path::PathBuf; + +use clap::Parser; + +/// Aphid: distinguishing gene flow from incomplete lineage sorting. +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +pub(crate) struct Args { + /// Raise to overwrite previous output folder. + #[clap(short, long)] + pub(crate) force: bool, + + /// Path to the config file. + pub(crate) config: PathBuf, + + /// Path to the output folder, created if missing. + pub(crate) output: PathBuf, +} diff --git a/src/bin/aphid/display_tree_analysis.rs b/src/bin/aphid/display_tree_analysis.rs deleted file mode 100644 index 963d213408b5c2f321c367f87d369e71e505639c..0000000000000000000000000000000000000000 --- a/src/bin/aphid/display_tree_analysis.rs +++ /dev/null @@ -1,205 +0,0 @@ -use aphid::{imbalance, BranchLength, Filters}; - -use super::{ - justify, ArrayVec, Colorize, GeneTree, Interner, ResolvedSymbol, SectionAnalysis, SectionLca, - ToString, TopologicalAnalysis, TreeSymbol, -}; - -//================================================================================================== -// Display tree analysis results for the first, topological pass, -// one line per "filter" (loosely defined yet). -// Return false for every filter not passed. -// This function is essentially a zoom back into the filtering process. -// Con 1: it duplicates most of the logic there. -// Pro 1: this is a good opportunity to keep the filtering procedure *modular* -// so that this fine-grained breakdown remains possible, -// which in turns improves future flexibility. -// Pro 2: this is not strictly necessary for the procedure to run, -// so it can just be turned off if performances eventually get in the way. -#[allow(clippy::too_many_lines)] // Ongoing experiment. -pub(super) fn display_topological_tree_analysis( - id: TreeSymbol, - raw: &GeneTree, - pruned: &GeneTree, - filters: &Filters, - analysis: &TopologicalAnalysis, - included: bool, - interner: &Interner, -) -> (bool, bool, Option<bool>) { - use SectionAnalysis as S; - let header = format!("{} nodes, pruned to {}.", raw.len(), pruned.len()); - let resolve = |vec: &[_]| { - vec.iter() - .map(|&sp| ResolvedSymbol::new(sp, interner)) - .collect::<Vec<_>>() - }; - - //---------------------------------------------------------------------------------------------- - // Retrieve (ok, message) status for every "filter" (loosely defined). - - // Triplet filter. - let triplet = match &analysis.triplet { - S::AllMissing => (false, "None found.".into()), - S::MissingSome(missing, _) => (false, format!("Missing species: {:?}.", resolve(missing))), - S::AllFound(SectionLca { id, paraphyletic }) => { - if paraphyletic.is_empty() { - (true, format!("LCA found: {id}.")) - } else { - let names = resolve(paraphyletic); - (false, format!("Paraphyletic with {names:?}.")) - } - } - }; - - // Outgroup filter. - let mut outgroup = { - let mut lines = ArrayVec::<_, 2>::new(); - match &analysis.outgroup { - S::AllMissing => lines.push((false, "None found.".into())), - S::MissingSome(missing, _) => { - lines.push((true, format!("Missing species: {:?}.", resolve(missing)))); - } - S::AllFound(_) => {} - } - match &analysis.outgroup { - S::AllMissing => {} - S::MissingSome(_, lca) | S::AllFound(lca) => { - let SectionLca { id, paraphyletic } = lca; - lines.push(if paraphyletic.is_empty() { - (true, format!("LCA found: {id}.")) - } else { - let names = resolve(paraphyletic); - (false, format!("Paraphyletic with {names:?}.")) - }); - } - } - lines - }; - outgroup.sort_unstable_by_key(|o| o.0); // Error first, if any. - - // Treetop filter. - let mut top = analysis.top.as_ref().map(|top| { - if top.external.is_empty() { - if let Some(internals) = &top.internal { - if filters.triplet_other_monophyly && !internals.outgroup_side().is_empty() { - let names = resolve(internals.outgroup_side()); - ( - false, - format!("LCA(triplet, other) paraphyletic with: {names:?}"), - ) - } else { - let names = resolve(&internals.species); - (true, format!("Other species found: {names:?}")) - } - } else { - (true, "No other species remain.".into()) - } - } else { - let names = resolve(&top.external); - (false, format!("External species found: {names:?}.")) - } - }); - - //---------------------------------------------------------------------------------------------- - // Display. - - // Header line. - let check = if included { "✔".green() } else { "🗙".red() }; - let name = interner.resolve(id).unwrap(); - eprintln!("{check} {}: {header}", name.blue()); - let indent = " "; - let tab = |p: &str| justify(p, 12, " ", '>'); - - // Triplet line. - let (ok, mut message) = triplet; - let mut prefix = String::from("triplet:"); - if !ok { - prefix = prefix.red().string(); - message = message.italic().string(); - } - eprintln!("{indent}{} {message}", tab(&prefix)); - - // Outgroup lines. - let mut prefix = String::from("outgroup:"); - if !outgroup.iter().all(|o| o.0) { - prefix = prefix.red().string(); - } - eprint!("{indent}{}", tab(&prefix)); - for &(ok, ref message) in &outgroup { - if ok { - eprint!(" {message}"); - } else { - eprint!(" {}", message.italic()); - } - } - eprintln!(); - - // Treetop line. - if let Some(t) = &mut top { - let &mut (ref ok, ref mut message) = t; - let mut a = String::from("treetop:"); - if !ok { - a = a.red().string(); - *message = message.italic().string(); - } - eprintln!("{indent}{} {message}", tab(&a)); - } - - (triplet.0, outgroup.iter().all(|o| o.0), top.map(|t| t.0)) -} - -//================================================================================================== -// Same, for the second, geometrical pass. -pub(super) fn display_geometrical_tree_analysis( - id: TreeSymbol, - gtree: &GeneTree, - (mean_triplet, mean_rest): (BranchLength, BranchLength), - global_shape: f64, - max_clock_ratio: f64, - included: bool, - interner: &Interner, -) { - // Loosely fit to previous formatting, but there is only one filter for now. - // TODO: factorize formatting out if it grows. - - let clock_ratio = { - let (triplet_longer, imbalance) = imbalance(mean_triplet, mean_rest); - let (t, r) = ("'triplet' section", "'outgroup' and 'other' section"); - let (small, large) = if triplet_longer { (t, r) } else { (r, t) }; - let (pass, sratio) = - gtree.is_shape_included(mean_triplet, mean_rest, global_shape, max_clock_ratio); - if pass { - ( - true, - format!( - "Branch lengths in {large} are only {imbalance:.3} times longer than in {small}: \ - shape ratio = {sratio:.3} < {max_clock_ratio:.3}." - ), - ) - } else { - ( - false, - format!( - "Branch lengths in {large} are {imbalance:.3} times longer than in {small}: \ - shape ratio = {sratio:.3} ⩾ {max_clock_ratio:.3}." - ), - ) - } - }; - - // Header line. - let check = if included { "✔".green() } else { "🗙".red() }; - let name = interner.resolve(id).unwrap(); - eprintln!("{check} {}:", name.blue()); - let indent = " "; - let tab = |p: &str| justify(p, 15, " ", '>'); - - // Clock ratio line. - let (ok, mut message) = clock_ratio; - let mut prefix = String::from("clock ratio:"); - if !ok { - prefix = prefix.red().string(); - message = message.italic().string(); - } - eprintln!("{indent}{} {message}", tab(&prefix)); -} diff --git a/src/bin/aphid/justify.rs b/src/bin/aphid/justify.rs deleted file mode 100644 index aa00ffd76a2215dece10eef64783fc5e84e7f9ea..0000000000000000000000000000000000000000 --- a/src/bin/aphid/justify.rs +++ /dev/null @@ -1,61 +0,0 @@ -// Small util to justify text with possible non-trivial unicode widths -// and terminal escape codes. - -use unicode_width::UnicodeWidthStr; - -pub(super) fn justify(input: &str, n: usize, fill: &str, dir: char) -> String { - debug_assert!(terminal_width(fill) == 1); - let w = terminal_width(input); - if n > w { - let d = n - w; - match dir { - '>' => format!("{}{input}", fill.repeat(d)), - '<' => format!("{input}{}", fill.repeat(d)), - '^' => { - let (d, r) = (d / 2, d % 2); - format!("{}{input}{}", fill.repeat(d), fill.repeat(d + r)) - } - _ => panic!("Invalid text justification directive: {dir:?}."), - } - } else { - input.into() - } -} - -// String width, not counting terminal escape codes, but taking utf8 widths into account. -pub(super) fn terminal_width(input: &str) -> usize { - let stripped = strip_ansi_escapes::strip(input.as_bytes()); - let stripped = String::from_utf8(stripped).expect("Stripping escapes lead to invalid utf8?"); - UnicodeWidthStr::width(stripped.as_str()) -} - -#[cfg(test)] -mod tests { - use colored::Colorize; - - use super::*; - - #[test] - fn justify_strings() { - let inputs = [ - format!( - "{} {} {} {}", - "✔".green(), - "A".italic(), - "🗙".red(), - "B".bold() - ), - "V A X B".into(), // Should be the same. - ]; - for input in inputs { - assert_eq!(terminal_width(&input), 7); - assert_eq!(justify(&input, 7, "-", '>'), input); - assert_eq!(justify(&input, 8, "-", '>'), format!("-{input}")); - assert_eq!(justify(&input, 8, "-", '<'), format!("{input}-")); - assert_eq!(justify(&input, 8, "-", '^'), format!("{input}-")); - assert_eq!(justify(&input, 9, "-", '>'), format!("--{input}")); - assert_eq!(justify(&input, 9, "-", '<'), format!("{input}--")); - assert_eq!(justify(&input, 9, "-", '^'), format!("-{input}-")); - } - } -} diff --git a/src/bin/aphid/main.rs b/src/bin/aphid/main.rs index e88d39866e44f1cd6d7ac9d24a1ddae6bb640b9b..415a42f1a9d4fe8ef4fa379d715b9fe47cc2b06c 100644 --- a/src/bin/aphid/main.rs +++ b/src/bin/aphid/main.rs @@ -1,62 +1,39 @@ -// Main entry point into the CLI. - use std::{ - cmp::max, collections::HashSet, - fmt::Write as FmtWrite, fs::{self, File}, io::Write as IoWrite, - path::{self, PathBuf}, + path::{self, Path, PathBuf}, process, }; use aphid::{ extract_local_triplet, imbalance, - interner::{Interner, ResolvedSymbol, TreeSymbol}, + interner::{Interner, ResolvedSymbol, SpeciesSymbol}, it_mean::SummedMean, ln_likelihood, optimize_likelihood, - topological_analysis::{SectionAnalysis, SectionLca}, - Config, GeneTree, GeneTriplet, GenesForest, TopologicalAnalysis, VERSION, + output::detail, + Config, GeneTree, GeneTriplet, GenesForest, LocalGeneTriplet, VERSION, }; -use arrayvec::ArrayVec; use clap::Parser; -use colored::{ColoredString, Colorize}; -use lexical_sort::natural_lexical_cmp; - -mod display_tree_analysis; -mod justify; -use display_tree_analysis::display_topological_tree_analysis; +use colored::Colorize; use float_eq::float_eq; -use justify::{justify, terminal_width}; +use serde_json::json; use snafu::{ensure, Snafu}; -use crate::display_tree_analysis::display_geometrical_tree_analysis; - -/// Aphid: distinguishing gene flow from incomplete lineage sorting. -#[derive(Parser, Debug)] -#[command(author, version, about, long_about = None)] -struct Args { - /// Raise to overwrite previous output folder. - #[clap(short, long)] - force: bool, - - /// Path to the config file. - config: PathBuf, - - /// Path to the output folder, created if missing. - output: PathBuf, -} +mod args; // Standard output filenames. mod out { pub(super) const CONFIG: &str = "config.json"; + pub(super) const DETAIL: &str = "detail.json"; } +// Run and terminate error bubbling if any. fn main() { - eprintln!("Running Aphid v{VERSION}."); + println!("Running Aphid v{VERSION}."); match run() { Ok(()) => { - eprintln!("Success. {}", "✓".bold().green()); + println!("Success. {}", "✓".bold().green()); } Err(e) => { eprintln!("{} {e}", "🗙 Error:".bold().red()); @@ -65,44 +42,92 @@ fn main() { } } -#[allow(clippy::too_many_lines)] // TODO: factorize later: ongoing experiment. fn run() -> Result<(), Error> { - //============================================================================== - // Init. + // Parse command-line arguments. + let args = args::Args::parse(); - let args = Args::parse(); - let separator = || eprintln!("\n{:=<80}", ""); - - eprintln!("Prepare string interner."); + // Initialize string interning optimisation. let mut interner = Interner::new(); let interner = &mut interner; - //============================================================================== - // Read. + // Read data from the various files. + let (config, forest) = read_inputs(&args, interner)?; + + // Check overall consistency. + // Extract the set of designated species = 'triplet' + 'outgroup' + 'other'. + let designated_species = check_input_consistency(&forest, &config, interner)?; + + // Prepare output folder. + let output = prepare_output(&args)?; + + // Output full configuration detail. + write_config(&output, &config, interner)?; + + // Prepare collection of per-tree detailed data. + let mut details = init_detail(&forest, interner); + + // Remove all non-designated species from the gene trees. + let pruned = prune(&forest, &mut details, &designated_species); + // Topology-filtering pass. + // Use this pass to calculate branches lengths means + // and prepare the triplet information for later likelihood calculation. + let (topology_included, (triplet_means_sum, rest_means_sum), local_triplets) = + topology_filter(&pruned, &mut details, &config, interner); + + // Geometry-filtering pass. + let included = geometry_filter( + &pruned, + &mut details, + topology_included, + (triplet_means_sum, rest_means_sum), + &config, + ); + + display_summary(&included, &details, &config); + + // Extract the only information needed for likelihood calculation. + let triplets = final_tree_selection(included, local_triplets, &mut details); + + // Output full per-tree detail. + write_detail(&output, &details)?; + + // Fit the model. + learn(&triplets, &config)?; + + Ok(()) +} + +fn read_inputs(args: &args::Args, interner: &mut Interner) -> Result<(Config, GenesForest), Error> { let path = &args.config; - eprintln!("Read config from {}.", format!("{path:?}").blue()); + println!("Read config from {}.", format!("{path:?}").blue()); let config = Config::from_file(path, interner)?; let path = &config.trees; - eprintln!("Read gene trees from {}.", format!("{path:?}").blue()); + println!("Read gene trees from {}:", format!("{path:?}").blue()); let forest = GenesForest::from_file(path, interner)?; - eprintln!(" Found {} gene trees.", forest.len()); + println!(" Found {} gene trees.", forest.len()); - //============================================================================== - // Check. + Ok((config, forest)) +} - eprintln!("Check input consistency."); +fn check_input_consistency( + forest: &GenesForest, + config: &Config, + interner: &mut Interner, +) -> Result<HashSet<SpeciesSymbol>, Error> { + println!("Check input consistency."); - // Check that all relevant species are actually found in the forest, + // Check that all designated species are actually found in the forest, // or they may be mispelled. - let relevant_species = config.relevant_species().collect::<HashSet<_>>(); - let mut not_found = relevant_species.clone(); + let designated_species = config.designated_species().collect::<HashSet<_>>(); + let mut not_found = designated_species.clone(); for gtree in forest.trees() { for sp in gtree.species_symbols() { not_found.remove(&sp); } } + ensure!( not_found.is_empty(), InputConsistencyErr { @@ -117,371 +142,291 @@ fn run() -> Result<(), Error> { } ); - //============================================================================== - // Prepare output. + Ok(designated_species) +} + +fn prepare_output(args: &args::Args) -> Result<PathBuf, Error> { + println!("Prepare output folder:"); - eprintln!("Prepare output folder."); - let output = &path::absolute(&args.output)?; + let output = path::absolute(&args.output)?; match (output.exists(), args.force) { (true, true) => { - eprintln!( + println!( " {} existing folder: {}.", "Removing".yellow(), format!("{}", output.display()).blue() ); - fs::remove_dir_all(output)?; + fs::remove_dir_all(&output)?; } - (true, false) => OutputExistsErr { path: output }.fail()?, + (true, false) => OutputExistsErr { path: &output }.fail()?, (false, _) => { - eprintln!( + println!( " Creating empty folder: {}.", format!("{}", output.display()).blue() ); } }; - fs::create_dir_all(output)?; + fs::create_dir_all(&output)?; - //============================================================================== - // Export full config specification, including implicit defaults. + Ok(output) +} +fn write_config(output: &Path, config: &Config, interner: &Interner) -> Result<(), Error> { let path = output.join(out::CONFIG); - eprintln!( - "Write full configuration to {}.", + println!( + " Write full configuration to {}.", format!("{}", path.display()).blue() ); let mut file = File::create(path)?; writeln!(file, "{:#}", config.resolve(interner).json())?; + Ok(()) +} - //============================================================================== - // Extract various information from the trees, - // and handle data from them with some kind of 'dataframe' pattern: - // an informal collection of same-sized columns. - let ids = forest.ids(); - let raw_trees = forest.trees(); - let raw_n_nodes = raw_trees.iter().map(GeneTree::len).collect::<Vec<_>>(); - let sequences_lengths = raw_trees +fn init_detail<'i>(forest: &GenesForest, interner: &'i Interner) -> Vec<detail::Tree<'i>> { + forest + .ids() .iter() - .map(GeneTree::sequence_length) - .collect::<Vec<_>>(); - - //---------------------------------------------------------------------------------------------- - eprintln!("Prune trees so they only contain relevant species."); - let (pruned_trees, pruned_n_nodes) = raw_trees.iter().fold( - (Vec::new(), Vec::new()), - |(mut trees, mut n_nodes), raw_tree| { - let pruned = raw_tree.pruned(|_, leaf| relevant_species.contains(&leaf.payload)); - n_nodes.push(pruned.len()); - trees.push(pruned); - (trees, n_nodes) - }, - ); + .zip(forest.trees().iter()) + .map(|(&id, tree)| detail::Tree { + id: interner.resolve(id).unwrap(), + n_bases: tree.sequence_length(), + n_nodes_raw: tree.len(), + ..Default::default() // Fill further information later. + }) + .collect() +} - //---------------------------------------------------------------------------------------------- - separator(); - eprintln!("Filter trees based on topology.\n"); - let mut n_excluded = 0; - let mut n_unresolved = 0; +fn prune( + forest: &GenesForest, + details: &mut [detail::Tree], + designated_species: &HashSet<SpeciesSymbol>, +) -> Vec<aphid::GeneTree> { + println!("Prune trees so they only contain designated species."); + forest + .trees() + .iter() + .zip(details.iter_mut()) + .map(|(raw, detail)| { + let pruned = raw.pruned(|_, leaf| designated_species.contains(&leaf.payload)); + detail.n_nodes_pruned = pruned.len(); + pruned + }) + .collect() +} - // Leave None in place to keep same-sized columns. - let mut passed_triplets = Vec::new(); - let mut passed_outgroup = Vec::new(); - let mut passed_top = Vec::new(); +fn topology_filter<'i>( + pruned: &[GeneTree], + details: &mut [detail::Tree<'i>], + config: &Config, + interner: &'i Interner, +) -> (Vec<usize>, (f64, f64), Vec<Option<aphid::LocalGeneTriplet>>) { + println!("Filter trees based on topology:"); let mut included = Vec::new(); + let mut n_excluded = 0; + let mut n_unresolved = 0; // Not counting excluded ones. - // If required, prepare for the second branches lengths based filter. - let max_clock_ratio = config.filters.max_clock_ratio; - let mut triplet_rest_sum_means = max_clock_ratio.is_some().then_some((0., 0.)); - let mut shapes = Vec::new(); - let mut shape_ratios = Vec::new(); + // Use this pass over trees to prepare next upcoming *geometry* filter. + let mut triplet_means_sum = 0.; + let mut rest_means_sum = 0.; - for i in 0..pruned_trees.len() { - let pruned = &pruned_trees[i]; - let id = ids[i]; - let raw = &raw_trees[i]; + let taxa = &config.taxa; + let tom = config.filters.triplet_other_monophyly; + let unrl = config.unresolved_length; + let mut local_triplets = Vec::new(); + for (i, (tree, detail)) in pruned.iter().zip(details.iter_mut()).enumerate() { // Decide first filter based on topology. - let topology = pruned.topological_analysis(&config.taxa); - let pass = topology.is_included(config.filters.triplet_other_monophyly); - n_excluded += usize::from(!pass); - - let (triplet, outgroup, treetop) = display_topological_tree_analysis( - id, - raw, - pruned, - &config.filters, - &topology, - pass, - interner, - ); - - // Prepare possible second filter pass by calculating geometrical properties. - if let Some((triplet_means_sum, rest_means_sum)) = &mut triplet_rest_sum_means { - shapes.push(pass.then(|| { - let (mean_triplet, mean_rest) = pruned.triplet_rest_mean_lengths(&topology); - *triplet_means_sum += mean_triplet; - *rest_means_sum += mean_rest; - (mean_triplet, mean_rest) - })); + let topology = tree.topological_analysis(taxa); + + // Collect detail. + let (triplet, outgroup, treetop) = detail::resolve_topology(&topology, taxa, tom, interner); + detail.triplet = triplet; + detail.outgroup = outgroup; + detail.top = treetop; + + // Synthetize triplet information for later likelihood calculation. + let loc = extract_local_triplet(tree, &topology, &taxa.triplet, unrl); + detail.triplet.analysis = loc.as_ref().map(detail::TripletAnalysis::new); + + if topology.is_included(tom) { + let mean_length = tree.mean_branch_length(tree.root()); + let (mean_triplet, mean_rest) = tree.triplet_rest_mean_lengths(&topology); + + triplet_means_sum += mean_triplet; + rest_means_sum += mean_rest; + + detail.included_topology = true; + detail.mean_lengths = detail::MeanLengths { + total: Some(mean_length), + triplet: Some(mean_triplet), + outgroup_other: Some(mean_rest), + }; + + n_unresolved += usize::from(!loc.as_ref().unwrap().resolved); + included.push(i); + } else { + n_excluded += 1; } - // If the triplet filter passed, analyze its topology wrt species tree. - let triplet = triplet.then(|| { - let trip = extract_local_triplet( - pruned, - &topology, - &config.taxa.triplet, - config.unresolved_length, - ); - n_unresolved += u64::from(!trip.resolved); - trip - }); - - passed_triplets.push(triplet); - passed_outgroup.push(outgroup); - passed_top.push(treetop); - included.push(pass.then_some(pruned)); + local_triplets.push(loc); } - eprintln!( - " --> {n_excluded} tree{} excluded from analysis based on their topology.", - if n_excluded > 1 { "s were" } else { " was" } + // Report. + println!( + " {n_excluded} tree{s_were} excluded from analysis based on their topology.", + s_were = if n_excluded > 1 { "s were" } else { " was" }, ); - if let Some(min) = config.unresolved_length { - eprintln!( - " --> {n_unresolved} included triplet{} unresolved (internal length <= {min}).", - if n_unresolved > 1 { "s were" } else { " was" } + if let Some(min) = unrl { + println!( + " {n_unresolved} included triplet{s_were} unresolved (internal length <= {min}).", + s_were = if n_unresolved > 1 { "s were" } else { " was" }, ); } - //---------------------------------------------------------------------------------------------- - // Geometry filter. - - let geometry_pass = max_clock_ratio.is_some(); // (TODO: is this flag actually needed?) + ( + included, + (triplet_means_sum, rest_means_sum), + local_triplets, + ) +} - if geometry_pass { - separator(); - eprintln!("Filter trees based on geometry.\n"); - let (sum_mean_triplets, sum_mean_rest) = triplet_rest_sum_means.unwrap(); - let max = max_clock_ratio.unwrap(); +fn geometry_filter( + pruned: &[GeneTree], + details: &mut [detail::Tree], + topology_included: Vec<usize>, + (triplet_means_sum, rest_means_sum): (f64, f64), + config: &Config, +) -> Vec<usize> { + println!("Filter trees based on geometry."); + + let (geometry_included, n_excluded) = if let Some(max) = config.filters.max_clock_ratio { + let mut included = Vec::new(); + let mut n_excluded = 0; - let (triplet_longer, imbalance) = imbalance(sum_mean_triplets, sum_mean_rest); + // Calculate global mean tree shape. + let (triplet_longer, imbalance) = imbalance(triplet_means_sum, rest_means_sum); let (t, r) = ("'triplet' section", "'outgroup' and 'other' section"); let (small, large) = if triplet_longer { (t, r) } else { (r, t) }; - eprintln!( + println!( " Branch lengths in {large} are on average {imbalance} times longer \ than in {small}." ); - let global_shape = sum_mean_triplets / sum_mean_rest; - eprintln!(" Mean tree shape: {global_shape}"); - - let mut n_excluded = 0; - for i in 0..shapes.len() { - if let Some(gtree) = included[i] { - let id = ids[i]; - let (mean_triplet, mean_rest) = shapes[i].unwrap(); - let (pass, shape_ratio) = - gtree.is_shape_included(mean_triplet, mean_rest, global_shape, max); - display_geometrical_tree_analysis( - id, - gtree, - (mean_triplet, mean_rest), - global_shape, - max, - pass, - interner, - ); - if !pass { - included[i] = None; - n_excluded += 1; - } - shape_ratios.push(Some((pass, shape_ratio))); + let global_shape = triplet_means_sum / rest_means_sum; + println!(" Mean tree shape: {global_shape}"); + + for i in topology_included { + let tree = &pruned[i]; + let detail = &mut details[i]; + + let detail::MeanLengths { + triplet: Some(mean_triplet), + outgroup_other: Some(mean_rest), + .. + } = detail.mean_lengths + else { + unreachable!() + }; + let (pass, shape_ratio) = + tree.is_shape_included(mean_triplet, mean_rest, global_shape, max); + detail.local_shape = Some(shape_ratio); + if pass { + detail.included_geometry = pass; + included.push(i); } else { - shape_ratios.push(None); + n_excluded += 1; } } - eprintln!( - " --> {n_excluded} tree{} excluded from analysis based on their geometry.", - if n_excluded > 1 { "s were" } else { " was" } + + (included, n_excluded) + } else { + (topology_included, 0) + }; + + // Report. + if n_excluded > 0 { + println!( + " --> {n_excluded} tree{s_were} excluded from analysis based on their geometry.", + s_were = if n_excluded > 1 { "s were" } else { " was" }, ); + } else { + println!(" --> All trees kept."); } - //---------------------------------------------------------------------------------------------- - separator(); - eprintln!("Calculate mean branches lengths per filtered tree."); - let mean_lengths = included - .iter() - .map(|tree| tree.map(|t| t.mean_branch_length(t.root()))) - .collect::<Vec<_>>(); + geometry_included +} - eprintln!("Estimate relative mutation rates."); - let global_mean = mean_lengths +fn final_tree_selection( + included: Vec<usize>, + mut local_triplets: Vec<Option<LocalGeneTriplet>>, + details: &mut [detail::Tree], +) -> Vec<GeneTriplet> { + println!("Estimate relative mutation rates."); + let global_mean = included .iter() - .filter_map(Option::clone) + .map(|&i| details[i].mean_lengths.total.unwrap()) .summed_mean::<usize>(); - eprintln!(" mean branch length accross trees: l = {global_mean}"); - let mutation_rates = mean_lengths - .iter() - .map(|o| o.map(|local_mean| local_mean / global_mean)) - .collect::<Vec<_>>(); - - //============================================================================== - // Display as a table. - separator(); - eprintln!("Pre-analysis results:\n"); - - // Column names and alignment. - let mut columns = vec![ - ("name", '<'), - ("bases", '>'), - ("raw", '>'), - ("pruned", '>'), - ("trip", '^'), - ("out", '^'), - ("top", '^'), - ]; - if max_clock_ratio.is_some() { - columns.push(("sratio", '>')); - } - columns.extend_from_slice(&[("l_i", '>'), ("α_i", '>')]); - let (header, aligns): (Vec<_>, Vec<_>) = columns.into_iter().unzip(); - - // First pass to prepare columns widths and content. - let mut lines = Vec::new(); - let mut widths = header.iter().copied().map(str::len).collect::<Vec<_>>(); - for i in 0..ids.len() { - let id = ids[i]; - let sl = &sequences_lengths[i]; - let n = &raw_n_nodes[i]; - let np = &pruned_n_nodes[i]; - let otrip = &passed_triplets[i]; - let out = passed_outgroup[i]; - let top = passed_top[i]; - let ml = mean_lengths[i]; - let mu = mutation_rates[i]; - - let ok = "✔".green(); - let nok = "🗙".red(); - let filter = |b| (if b { &ok } else { &nok }).string(); - let mut line = vec![ - format!("{:?}", ResolvedSymbol::new(id, interner)), - format!("{sl}"), - format!("{n}"), - format!("{np}"), - if let Some(trip) = otrip { - let top = trip.topology; - if trip.resolved { - format!("{top}").yellow() - } else { - format!("≈{top}").black().italic() - } - .string() - } else { - nok.string() - }, - filter(out), - if let Some(f) = top { filter(f) } else { "-".into() }, - ]; - if max_clock_ratio.is_some() { - line.push(if let &Some((pass, sr)) = &shape_ratios[i] { - let sr = format!("{sr:.3}"); - if pass { - sr - } else { - let sr = sr.italic().red(); - format!("{nok} {sr}") - } - } else { - "-".into() - }); - } - line.extend_from_slice(&[ - if let Some(ml) = ml { format!("{ml:.3}") } else { "-".into() }, - if let Some(mu) = mu { format!("{mu:.3}") } else { "-".into() }, - ]); - for ((w, h), field) in widths.iter_mut().zip(&header).zip(&line) { - *w = max(max(*w, h.len()), terminal_width(field)); - } - lines.push(line); + + println!(" mean branch length accross trees: l = {global_mean}"); + let mut triplets = Vec::new(); + for i in included { + let local = local_triplets[i].take().unwrap(); // Exists since included. + let detail = &mut details[i]; + let local_mean = detail.mean_lengths.total.unwrap(); + let relative_mutation_rate = local_mean / global_mean; + detail.mutation_rate = Some(relative_mutation_rate); + triplets.push(GeneTriplet { local, relative_mutation_rate }); } - // Second pass to sort by tree name. - lines.sort_unstable_by(|a, b| natural_lexical_cmp(&a[0], &b[0])); + triplets +} - // Third pass to actually display the table. - // Header line. - eprintln!( - "|{}", - header - .iter() - .zip(&widths) - .fold(String::new(), |mut s, (&h, &w)| { - write!(s, "{}|", justify(h, w + 2, " ", '^')).unwrap(); - s - }) - ); - // Separator line. - eprintln!( - "|{}|", - widths - .iter() - .map(|w| format!("{:-^a$}", "", a = w + 2)) - .collect::<Vec<String>>() - .join("+") +fn write_detail(output: &Path, details: &[detail::Tree]) -> Result<(), Error> { + let path = output.join(out::DETAIL); + println!( + "Write full trees analysis/filtering detail to {}.", + format!("{}", path.display()).blue() ); - // Data lines. - for line in lines { - eprintln!( - "|{}", - line.iter() - .zip(&aligns) - .zip(&widths) - .fold(String::new(), |mut s, ((f, &a), &w)| { - write!(s, " {} |", justify(f, w, " ", a)).unwrap(); - s - }) - ); - } + let mut file = File::create(path)?; + writeln!(file, "{:#}", json!(details))?; + Ok(()) +} - //---------------------------------------------------------------------------------------------- - eprintln!("\nSummary:"); +fn display_summary(included: &[usize], details: &[detail::Tree], config: &Config) { + println!("\nSummary:"); let s = |n| if n > 1 { "s" } else { "" }; - let n = passed_triplets - .iter() - .map(|g| u64::from(g.is_none())) - .sum::<u64>(); - eprintln!( - " - {} triplet{} rejected (incomplete or non-monophyletic).", - n, - s(n) + let n = details.iter().map(|d| u64::from(!d.triplet.included)).sum(); + println!( + " - {n} triplet{} rejected (incomplete or non-monophyletic).", + s(n), ); + if let Some(min) = config.unresolved_length { - let n = passed_triplets + let n = details .iter() - .filter_map(|g| g.as_ref().map(|g| u64::from(!g.resolved))) - .sum::<u64>(); - eprintln!( - " - {} triplet{} considered unresolved (internal branch length ⩽ {min}).", - n, - s(n) + .filter_map(|d| d.triplet.analysis.as_ref().map(|a| u64::from(!a.resolved))) + .sum(); + println!( + " - {n} triplet{} considered unresolved (internal branch length ⩽ {min}).", + s(n), ); } - let n = passed_outgroup + + let n = details .iter() - .map(|pass| u64::from(!pass)) - .sum::<u64>(); - eprintln!( - " - {} outgroup{} rejected (empty or non-monophyletic).", - n, + .map(|d| u64::from(!d.outgroup.included)) + .sum(); + println!( + " - {n} outgroup{} rejected (empty or non-monophyletic).", s(n) ); - let n = passed_top + + let n = details .iter() - .map(|&pass| if let Some(p) = pass { u64::from(!p) } else { 1 }) + .map(|d| d.top.as_ref().map_or(1, |top| (!top.included).into())) .sum::<u64>(); - eprintln!( - " - {} tree top{} rejected (non-root LCA(triplet, outgroup){}).", - n, + println!( + " - {n} tree top{} rejected (non-root LCA(triplet, outgroup){}).", s(n), if config.filters.triplet_other_monophyly { " or non-monophyletic (triplet, other)" @@ -489,73 +434,34 @@ fn run() -> Result<(), Error> { "" } ); - if let Some(max) = max_clock_ratio { - let n = shape_ratios + + if let Some(max) = config.filters.max_clock_ratio { + let n = details .iter() - .filter_map(|o| o.map(|(pass, _)| u64::from(!pass))) - .sum::<u64>(); - eprintln!( - " - {} tree shape{} rejected \ + .map(|d| u64::from(!d.included_geometry)) + .sum(); + println!( + " - {n} tree shape{} rejected \ (triplet/outgroup imbalance larger than {max} times the average).", - n, s(n) ); } - let n = included.iter().map(|o| u64::from(o.is_some())).sum::<u64>(); - eprintln!(" => {} tree{} kept for analysis.", n, s(n)); - //============================================================================== - separator(); - - // Reduce number of lines in the "dataframe" to only focus on kept trees. - let mut kept_ids = Vec::new(); - let mut triplets = Vec::new(); - for i in 0..ids.len() { - if included[i].is_some() { - let local = passed_triplets[i].take().unwrap(); - let &relative_mutation_rate = mutation_rates[i].as_ref().unwrap(); - - // Upgrade local triplets to extended versions, aware of the whole dataset. - let triplet = GeneTriplet { local, relative_mutation_rate }; - - kept_ids.push(ids[i]); - triplets.push(triplet); - } - } - drop(passed_triplets); // Has been consumed into. + let n = included.len() as u64; + println!(" => {n} tree{} kept for analysis.", s(n)); +} - eprintln!("Starting point: {:#?}", config.search.init_parms); +fn learn(triplets: &[GeneTriplet], config: &Config) -> Result<(), Error> { + println!("\nLearning:"); let parms = &config.search.init_parms; - eprintln!("\nInitial ln-likelihood:"); - #[cfg(test)] - let display_tree_lnl_detail = |parms| { - for (&id, trip) in kept_ids.iter().zip(triplets.iter()) { - eprintln!( - " {}\t{}", - trip.likelihood(parms).ln(), - interner.resolve(id).unwrap(), - ); - } - }; - let lnl = ln_likelihood(&triplets, parms); - #[cfg(test)] - display_tree_lnl_detail(parms); - eprintln!("total: {lnl}"); - - eprintln!("\nOptimizing ln-likelihood:"); - let (opt, opt_lnl) = optimize_likelihood(&triplets, parms, &config.search)?; - - eprintln!("\nOptimized ln-likelihood:"); - #[cfg(test)] - display_tree_lnl_detail(&opt); - eprintln!("total: {opt_lnl}"); - - eprintln!("\nEnding point: {opt:#?}\n"); + let (opt, opt_lnl) = optimize_likelihood(triplets, parms, &config.search)?; + println!("Optimized ln-likelihood: {opt_lnl}"); + println!("Optimized parameters: {opt:#?}\n"); // Smoke test: // TODO: turn into actual testing by including 'official' example data. - let r = ln_likelihood(&triplets, &opt); + let r = ln_likelihood(triplets, &opt); let t = opt_lnl; assert!( float_eq!(t, r, abs <= 1e-6), @@ -563,20 +469,9 @@ fn run() -> Result<(), Error> { regular: {r}\n\ tensors: {t}\n" ); - Ok(()) } -// Convenience for coloring owned strings. -trait ToString { - fn string(&self) -> String; -} -impl ToString for ColoredString { - fn string(&self) -> String { - format!("{self}") - } -} - #[derive(Debug, Snafu)] #[snafu(context(suffix(Err)))] enum Error { diff --git a/src/config.rs b/src/config.rs index 1f265e451b8d39da9975326abceff74d9fb1419c..dc68e041152fe46d8f4b1e4907629e64df28a92a 100644 --- a/src/config.rs +++ b/src/config.rs @@ -78,8 +78,8 @@ pub struct Filters { // Every section contains distinct species. pub struct Taxa { pub triplet: SpeciesTriplet, - pub(crate) outgroup: Vec<SpeciesSymbol>, - pub(crate) other: Vec<SpeciesSymbol>, + pub outgroup: Vec<SpeciesSymbol>, + pub other: Vec<SpeciesSymbol>, } // The triplet is importantly *oriented*: ((A, B), C). diff --git a/src/config/check.rs b/src/config/check.rs index d824e461b95da42eccb966dbe5f91fa821d30013..16bb02207142c12663d24c3a2cb6e522306ae931 100644 --- a/src/config/check.rs +++ b/src/config/check.rs @@ -93,8 +93,8 @@ impl Config { }) } - // The union of all sections constitutes the set of "relevant" species. - pub fn relevant_species(&self) -> impl Iterator<Item = SpeciesSymbol> + '_ { + // The union of all sections constitutes the set of "designated" species. + pub fn designated_species(&self) -> impl Iterator<Item = SpeciesSymbol> + '_ { let Taxa { triplet, outgroup, other } = &self.taxa; triplet .iter() @@ -193,20 +193,17 @@ impl BfgsConfig { }; if path.is_file() { let path = io::canonicalize(path)?; - eprintln!("Will override {:?}.", path.to_string_lossy()); + eprintln!("Will override {}.", path.display()); } else { ensure!( !path.is_dir(), - err!(("Path designates a folder: {:?}", path.to_string_lossy())) + err!(("Path designates a folder: {}", path.display())) ); if let Some(parent) = path.parent() { ensure!(parent.exists(), NoSuchFolderErr { path: parent }); } else { - return err!(( - "Path does not exist and has no parent: {:?}", - path.to_string_lossy() - )) - .fail(); + return err!(("Path does not exist and has no parent: {}", path.display())) + .fail(); } } Ok(Some(path.clone())) @@ -469,17 +466,17 @@ impl raw::Taxa { } // No duplicates should be found among the taxa explicitly given. - let mut relevant_species = HashSet::new(); + let mut designated_species = HashSet::new(); let mut new_species = move |name, section| -> Result<_, Error> { ensure!( - !relevant_species.contains(&name), + !designated_species.contains(&name), err!( ("Species {name:?} found in '{section}' section \ is given twice within the [taxa] table.") ) ); let symbol = interner.get_or_intern(name); - relevant_species.insert(name); + designated_species.insert(name); // Also record into the interner. Ok(symbol) }; diff --git a/src/gene_tree.rs b/src/gene_tree.rs index 58acfa95b429184ccf08236eae1890ee38dab760..047066dd160d98f410df1e06036f9a8c76cb7749 100644 --- a/src/gene_tree.rs +++ b/src/gene_tree.rs @@ -18,7 +18,9 @@ pub(crate) mod parse; pub mod topological_analysis; pub(crate) mod triplet; -pub use triplet::{extract_local as extract_local_triplet, Topology as TripletTopology}; +pub use triplet::{ + extract_local as extract_local_triplet, LocalGeneTriplet, Topology as TripletTopology, +}; // Alias numeric types to use semantic "units". diff --git a/src/gene_tree/topological_analysis.rs b/src/gene_tree/topological_analysis.rs index 722764722d1bce2bec43451334e592988b9c6415..df59f2ed70b9b38fffc4e52635ad59f31adf1f76 100644 --- a/src/gene_tree/topological_analysis.rs +++ b/src/gene_tree/topological_analysis.rs @@ -1,6 +1,6 @@ // Trees are included or not // depending on the species they contain -// compared to the relevant species listed in taxa config. +// compared to the species designated in taxa config. // // This requires a topological analysis // based on the species names found in the config 'Taxa' section. @@ -158,7 +158,7 @@ impl GeneTree { } impl SectionAnalysis { - pub(crate) fn lca(&self) -> Option<&SectionLca> { + pub fn lca(&self) -> Option<&SectionLca> { use SectionAnalysis as S; match self { S::AllMissing => None, diff --git a/src/gene_tree/triplet.rs b/src/gene_tree/triplet.rs index b6f4026fbc5b0fc92e161c70e5cc1247daea2857..931cc41cbb22d33a9b743d16aa15931b98b6de53 100644 --- a/src/gene_tree/triplet.rs +++ b/src/gene_tree/triplet.rs @@ -3,6 +3,8 @@ use std::fmt::{self, Display}; +use serde::Serialize; + use super::{MeanNbBases, NbBases}; use crate::{ config::SpeciesTriplet, @@ -13,12 +15,15 @@ use crate::{ }; // Matches the gene triplet topology onto the expected species topology. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Serialize, Debug, Clone, Copy, PartialEq, Eq)] #[allow(clippy::upper_case_acronyms)] pub enum Topology { - ABC, // ((A,B),C) -> Concordant triplets. - ACB, // ((A,C),B) -> The outer node is B instead of C. - BCA, // ((B,C),A) -> The outer node is A instead of C. + /// Concordant triplet: ((A, B), C). + ABC, + /// The outer node is B instead of C: ((A, C), B). + ACB, + /// The outer node is A instead of C: ((B, C), A). + BCA, } // Extract the minimal information needed from a full gene tree @@ -43,7 +48,7 @@ pub struct LocalGeneTriplet { // Lengths stored here are not 'per-site' guesses, // they are a rounded integer estimate of the number of mutations. pub sequence_length: NbBases, - pub(crate) branches_lengths: [NbBases; 4], + pub branches_lengths: [NbBases; 4], pub topology: Topology, // Raise if the topology is considered sufficiently resolved // to exclude discordant scenarios. @@ -59,45 +64,52 @@ pub struct GeneTriplet { pub relative_mutation_rate: f64, // Known in the paper as "alpha_i". } -// Assuming the gene tree has been correctly checked, -// match it to the associated topology. +// Match the gene tree to the associated topology. // Refer to the species topology as ((a, b), c) // and to the actual gene tree topology as ((u, v), w). +// Abort with None if the gene tree triplet +// is either incomplete or paraphyletic. #[allow(clippy::many_single_char_names)] pub fn extract_local( gtree: &GeneTree, analysis: &TopologicalAnalysis, species_triplet: &SpeciesTriplet, unresolved_length: Option<MeanNbBases>, -) -> LocalGeneTriplet { +) -> Option<LocalGeneTriplet> { use Node as N; - let err = || panic!("Did invalid trees pass the filter?"); let seqlen = gtree.sequence_length; // Extract triplet ancestor. - let SectionAnalysis::AllFound(SectionLca { id: lca_id, .. }) = analysis.triplet else { - err() + let SectionAnalysis::AllFound(SectionLca { id: lca_id, ref paraphyletic }) = analysis.triplet + else { + return None; }; + if !paraphyletic.is_empty() { + return None; + } + let nodes = >ree.tree.nodes; let N::Internal(lca) = &nodes[lca_id] else { - err(); + unreachable!() }; // Use it to extract triplet nodes, regardless of contingent tree ordering. let ([N::Internal(uv), N::Terminal(w)] | [N::Terminal(w), N::Internal(uv)]) = lca.children.map(|i| &nodes[i]) else { - err() + unreachable!() }; let [u, v] = uv.children.map(|i| { - let N::Terminal(n) = &nodes[i] else { err() }; + let N::Terminal(n) = &nodes[i] else { + unreachable!() + }; n }); // Compare to the species topology to figure the order. let [a, b, c] = species_triplet.as_array(); let uv = nb_mutations(uv.branch, seqlen); - LocalGeneTriplet { + Some(LocalGeneTriplet { branches_lengths: { // Reorganize terminal branches lengths so they match species order. let ids_lens = [u, v, w].map(|n| (n.payload, n.branch)); @@ -121,12 +133,12 @@ pub fn extract_local( } else if w == a { T::BCA } else { - err() + unreachable!() } }, sequence_length: seqlen, resolved: if let Some(ul) = unresolved_length { uv > ul } else { true }, - } + }) } impl Topology { @@ -177,6 +189,32 @@ mod tests { assert_eq!(expected, actual); }; + //------------------------------------------------------------------------------------------ + // Excluded triplets. + + // Incomplete. + // + // │ + // ┌──0──┐ + // │ │ + // ┌─1─┐ │ + // 2 3 4 + // T X V + check("((T:2,X:3):1,V:4):0;", None); + + // Paraphyletic. + // + // │ + // ┌──0──┐ + // │ │ + // ┌──1──┐ │ + // │ │ │ + // ┌─2─┐ │ │ + // 3 4 5 6 + // T U X V + check("(((T:3,U:4):2,X:5):1,V:6):0;", None); + + //------------------------------------------------------------------------------------------ // Tree with only a triplet. // // │ @@ -187,36 +225,36 @@ mod tests { // T U V check( "((T:2,U:3):1,V:4):0;", - LocalGeneTriplet { + Some(LocalGeneTriplet { branches_lengths: [20, 30, 40, 10], sequence_length, resolved: true, topology: T::ABC, - }, + }), ); // Considered unresolved. for (uv, d) in [(0., 0), (0.03, 0), (0.05, 1) /* ⚠ Rounds to 1 ⚠ */] { check( &format!("((T:2,U:3):{uv},V:4):0;"), - LocalGeneTriplet { + Some(LocalGeneTriplet { branches_lengths: [20, 30, 40, d], sequence_length, resolved: false, topology: T::ABC, - }, + }), ); } // Considered resolved. let uv = 0.06; check( &format!("((T:2,U:3):{uv},V:4):0;"), - LocalGeneTriplet { + Some(LocalGeneTriplet { branches_lengths: [20, 30, 40, 1], sequence_length, resolved: true, topology: T::ABC, - }, + }), ); //------------------------------------------------------------------------------------------ @@ -230,36 +268,36 @@ mod tests { // V T U check( "(V:1,(T:3,U:4):2):0;", - LocalGeneTriplet { + Some(LocalGeneTriplet { branches_lengths: [30, 40, 10, 20], sequence_length, resolved: true, topology: T::ABC, - }, + }), ); // Considered unresolved. for (uv, d) in [(0., 0), (0.03, 0), (0.05, 1) /* ⚠ Rounds to 1 ⚠ */] { check( &format!("(V:1,(T:3,U:4):{uv}):0;"), - LocalGeneTriplet { + Some(LocalGeneTriplet { branches_lengths: [30, 40, 10, d], sequence_length, resolved: false, topology: T::ABC, - }, + }), ); } // Considered resolved. let uv = 0.06; check( &format!("(V:1,(T:3,U:4):{uv}):0;"), - LocalGeneTriplet { + Some(LocalGeneTriplet { branches_lengths: [30, 40, 10, 1], sequence_length, resolved: true, topology: T::ABC, - }, + }), ); // @@ -271,12 +309,12 @@ mod tests { // V U T check( "(V:1,(U:3,T:4):2):0;", - LocalGeneTriplet { + Some(LocalGeneTriplet { branches_lengths: [40, 30, 10, 20], sequence_length, resolved: true, topology: T::ABC, - }, + }), ); // │ @@ -287,12 +325,12 @@ mod tests { // U T V check( "((U:2,T:3):1,V:4):0;", - LocalGeneTriplet { + Some(LocalGeneTriplet { branches_lengths: [30, 20, 40, 10], sequence_length, resolved: true, topology: T::ABC, - }, + }), ); //------------------------------------------------------------------------------------------ @@ -306,12 +344,12 @@ mod tests { // T V U check( "(T:1,(V:3,U:4):2):0;", - LocalGeneTriplet { + Some(LocalGeneTriplet { branches_lengths: [10, 40, 30, 20], sequence_length, resolved: true, topology: T::BCA, - }, + }), ); // @@ -323,12 +361,12 @@ mod tests { // U V T check( "(U:1,(V:3,T:4):2):0;", - LocalGeneTriplet { + Some(LocalGeneTriplet { branches_lengths: [40, 10, 30, 20], sequence_length, resolved: true, topology: T::ACB, - }, + }), ); // @@ -340,12 +378,12 @@ mod tests { // U V T check( "((U:2,V:3):1,T:4):0;", - LocalGeneTriplet { + Some(LocalGeneTriplet { branches_lengths: [40, 20, 30, 10], sequence_length, resolved: true, topology: T::BCA, - }, + }), ); // @@ -357,12 +395,12 @@ mod tests { // T V U check( "((T:2,V:3):1,U:4):0;", - LocalGeneTriplet { + Some(LocalGeneTriplet { branches_lengths: [20, 40, 30, 10], sequence_length, resolved: true, topology: T::ACB, - }, + }), ); //------------------------------------------------------------------------------------------ @@ -382,12 +420,12 @@ mod tests { // O P Q X Y T U V Z check( "(((O:3,(P:5,Q:6):4):2,X:7):1,(Y:9,(((T:13,U:14):12,V:15):11,Z:16):10):8):0;", - LocalGeneTriplet { + Some(LocalGeneTriplet { branches_lengths: [130, 140, 150, 120], sequence_length, resolved: true, topology: T::ABC, - }, + }), ); // Discordant. @@ -406,12 +444,12 @@ mod tests { // O P Q X Y U V T Z check( "(((O:3,(P:5,Q:6):4):2,X:7):1,(Y:9,(((U:13,V:14):12,T:15):11,Z:16):10):8):0;", - LocalGeneTriplet { + Some(LocalGeneTriplet { branches_lengths: [150, 130, 140, 120], sequence_length, resolved: true, topology: T::BCA, - }, + }), ); } } diff --git a/src/genes_forest.rs b/src/genes_forest.rs index 0411afc203968d4299a2579087217ccdf049bab3..5d3303210ffd2041b1e97d9364d4f404f9a225a9 100644 --- a/src/genes_forest.rs +++ b/src/genes_forest.rs @@ -14,7 +14,6 @@ use crate::{interner::TreeSymbol, GeneTree}; pub(crate) mod filter; pub mod parse; -// Bound to the input file lifetime. pub struct GenesForest { trees: Vec<GeneTree>, ids: Vec<TreeSymbol>, diff --git a/src/genes_forest/filter.rs b/src/genes_forest/filter.rs index 475c37d6072dca76768b8bce6d3dd1868b74e599..9c0d57de16bf852234a29161779d492d60bc4db0 100644 --- a/src/genes_forest/filter.rs +++ b/src/genes_forest/filter.rs @@ -7,36 +7,66 @@ use crate::{ gene_tree::{topological_analysis::SectionAnalysis, BranchLength}, + topological_analysis::TreeTop, GeneTree, TopologicalAnalysis, }; impl TopologicalAnalysis { pub fn is_included(&self, triplet_other_monophyly: bool) -> bool { - use SectionAnalysis as S; + if !self.triplet.is_included_as_triplet() { + return false; + } + + if !self.outgroup.is_included_as_outgroup() { + return false; + } + + let Some(top) = &self.top else { + panic!("Treetop should exist if both triplet and outgroup are included.") + }; + + if !top.is_included(triplet_other_monophyly) { + return false; + } + + true + } +} +impl SectionAnalysis { + pub(crate) fn is_included_as_triplet(&self) -> bool { // Reject incomplete triplets. - let S::AllFound(lca) = &self.triplet else { + let Self::AllFound(lca) = &self else { return false; }; + // Reject non-monophyletic triplets. if !lca.paraphyletic.is_empty() { return false; } + true + } + + pub(crate) fn is_included_as_outgroup(&self) -> bool { // Reject empty outgroups. - let Some(lca) = self.outgroup.lca() else { + let Some(lca) = self.lca() else { return false; }; + // Reject non-monophyletic outgroup. if !lca.paraphyletic.is_empty() { return false; } + true + } +} + +impl TreeTop { + pub(crate) fn is_included(&self, triplet_other_monophyly: bool) -> bool { // Reject non-root LCA(triplet, outgroup). - let Some(top) = &self.top else { - panic!("Treetop should exist if triplet and outgroup are both monophyletic.") - }; - if !top.external.is_empty() { + if !self.external.is_empty() { return false; } @@ -44,11 +74,8 @@ impl TopologicalAnalysis { if triplet_other_monophyly { // Rejection occurs if and only if 'other' species // branch from the 'outgroup' side of LCA(triplet, outgroup) subtree. - let Some(internal) = &top.internal else { - panic!( - "Internal species should exist \ - if triplet and outgroup are both monophyletic." - ); + let Some(internal) = &self.internal else { + return false; // Or if one is in direct lineage with the other. }; if !internal.outgroup_side().is_empty() { return false; diff --git a/src/learn.rs b/src/learn.rs index e332d32a55c3799b7438838b7ed1e004492863a4..97b34dea2c05b92ed327efef41233147f9f8fae6 100644 --- a/src/learn.rs +++ b/src/learn.rs @@ -57,7 +57,7 @@ pub fn optimize_likelihood( // If this happens, perform the optimization // on a smaller batch of the data first until we get finite likelihood. let sc: Scores<f64> = (&scores(&p)).into(); - eprintln!( + println!( "The chosen starting point yields non-finite log-likelihood ({first_lnl}) \ on the whole dataset ({n_triplets} triplets):\n{sc}\n{start:?}", ); @@ -83,7 +83,7 @@ pub fn optimize_likelihood( n_samples > 0, NonFiniteLikelihoodErr { lnl: first_lnl, parms: start.clone() } ); - eprintln!( + println!( "-- Try obtaining finite log-likelihood \ with the {n_samples} first triplets." ); @@ -92,21 +92,21 @@ pub fn optimize_likelihood( let lnl = ln_likelihood_tensors(&sub_x, &scores(&p)).to_double(); n_eval += 1; if !lnl.is_finite() { - eprintln!("---- Failure: obtained log-likelihood: {lnl}."); + println!("---- Failure: obtained log-likelihood: {lnl}."); // Reduce the data again. n_samples = reduce(n_samples); continue 'x; } // With this working (sub_X, P), perform a learning pass // to produce a (hopefully better) candidate P. - eprintln!( + println!( "---- Success: obtained log-likelihood: {lnl}.\n\ -- Learn on this subsample with simple gradient descent." ); let f = |p: &Tensor| -ln_likelihood_tensors(&sub_x, &scores(p)); match search.init_descent.minimize(f, &p, 0) { Err(e) => { - eprintln!("---- Learning failed:\n{e}"); + println!("---- Learning failed:\n{e}"); n_samples /= 2; continue 'x; } @@ -116,24 +116,24 @@ pub fn optimize_likelihood( p = opt.best_vars().copy(); let sc: Scores<f64> = (&scores(&p)).into(); let pm: Parameters<f64> = (&sc).into(); - eprintln!( + println!( "---- Success: parameters learned on the subsample:\n{sc}\n{pm:?}" ); break 'x; } } } - eprintln!("-- Try with this new starting point."); + println!("-- Try with this new starting point."); let lnl = ln_likelihood_tensors(&x, &scores(&p)).to_double(); n_eval += 1; if !lnl.is_finite() { - eprintln!( + println!( "---- Failure: obtained non-finite log-likelihood again ({lnl}). \ Try subsampling again from the new starting point." ); continue 'p; } - eprintln!( + println!( "---- Success: obtained finite log-likelihood on the whole dataset ({lnl}).\n\ -- Start learning from this new starting point." ); diff --git a/src/lib.rs b/src/lib.rs index 7630d75620dfe88485116dc74fd27468208d4c0a..ced4954e68919311934d4eda0e9efdf989bca24c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,15 +8,15 @@ pub mod learn; mod lexer; mod model; mod optim; +pub mod output; mod tree; -mod output; pub use config::{Config, Filters}; pub use gene_tree::{ extract_local_triplet, topological_analysis::{self, TopologicalAnalysis}, triplet::GeneTriplet, - BranchLength, GeneTree, + BranchLength, GeneTree, LocalGeneTriplet, }; pub use genes_forest::{filter::imbalance, GenesForest}; pub use learn::optimize_likelihood; diff --git a/src/model/scenarios.rs b/src/model/scenarios.rs index cdb9e5c1719289a080f158f5827ca8a925f324b2..9785cc21192cc3814d060bb368a9f62b59defe68 100644 --- a/src/model/scenarios.rs +++ b/src/model/scenarios.rs @@ -76,7 +76,6 @@ pub(crate) fn n_scenarios(n_gf_times: usize) -> usize { } impl Scenario { - pub(crate) fn iter(n_gf_times: usize) -> impl Iterator<Item = Self> { use Scenario as S; iter::once(S::NoEvent) diff --git a/src/optim/bfgs.rs b/src/optim/bfgs.rs index eeb7c9ee0e8fac19b113e24d6174dd4c9e9af2b2..ed71ade85462409b9f4563e5fbaca4fb50eac104 100644 --- a/src/optim/bfgs.rs +++ b/src/optim/bfgs.rs @@ -287,7 +287,7 @@ impl<F: Fn(&Tensor) -> Tensor> BfgsSearch<'_, F> { for vec in ["var", "grad", "dir"] { if let Some(varnames) = &self.cf.variable_names { for name in varnames { - header.push(format!("{vec}_{name}")) + header.push(format!("{vec}_{name}")); } } else { for i in 0..self.n_vars { diff --git a/src/output.rs b/src/output.rs index 8296ae4c60bb6f383f12c13135c5534e07600e69..585d6fa6e5cb9fae986dae76ddb7366c269eab6d 100644 --- a/src/output.rs +++ b/src/output.rs @@ -1,4 +1,4 @@ // Use this module to specify the possible output(s) of aphid. mod config; - +pub mod detail; diff --git a/src/output/detail.rs b/src/output/detail.rs new file mode 100644 index 0000000000000000000000000000000000000000..0228f7ba14ba905b3e6fbcbe862ee15fd9eaa3f5 --- /dev/null +++ b/src/output/detail.rs @@ -0,0 +1,279 @@ +// Use this structure to collect all per-tree information +// susceptible to interest user after an aphid run. +// Export as a json file. + +use arrayvec::ArrayVec; +use serde::Serialize; + +use crate::{ + config::Taxa, + gene_tree::{NbBases, TripletTopology}, + interner::{Interner, SpeciesSymbol}, + topological_analysis::{SectionAnalysis, TreeTop}, + BranchLength, LocalGeneTriplet, TopologicalAnalysis, +}; + +/// All information used or produced by aphid +/// regarding one particular gene tree. +#[derive(Serialize, Default)] +pub struct Tree<'i> { + // Borrow from species/trees identifiers. + /// Tree identifier, as given in input. + pub id: &'i str, + /// Sequence length. + pub n_bases: NbBases, + /// Number of nodes in the input tree. + pub n_nodes_raw: usize, + + /// Number of nodes after only species of interest have been kept. + pub n_nodes_pruned: usize, + + /// Status of the focal triplet in this tree. + pub triplet: Triplet<'i>, + + /// Status of the designated outgroup in this tree. + pub outgroup: Outgroup<'i>, + + /// Status of this tree's most ancestral nodes. + /// Undefined if either no triplet species or no outgroup species were found. + pub top: Option<Top<'i>>, + + /// Raised if the tree passed the topology filter. + pub included_topology: bool, + + /// Mean branches lengths, + /// undefined if the none of the species set + /// they are supposed to be calculated over is found within the tree. + pub mean_lengths: MeanLengths, + + /// "Absolute" ratio of mean triplet length + /// and mean length of designated 'outgroup' and 'other' species: + /// always superior to 1 to measure 'imbalance', + /// regardless which of the numerator or denominator is greater. + pub local_shape: Option<f64>, + + /// Raised if the tree passed the geometry filter. + pub included_geometry: bool, + + /// Estimated mutation rate for this tree. + /// (No estimate if the tree was excluded from analysis.) + pub mutation_rate: Option<f64>, +} + +#[derive(Serialize, Default)] +pub struct Triplet<'i> { + /// LCA(triplet): the most recent ancestor + /// of the focal triplet species found in this tree. + /// Undefined if all triplet species were missing. + pub lca: Option<usize>, + /// Triplet species not found within this tree. + pub missing: ArrayVec<&'i str, 3>, + /// Paraphyletic species found within this tree: + /// these descend from LCA(triplet) but don't belong to the focal triplet. + pub paraphyletic: Vec<&'i str>, + + /// Further information calculated iif the triplet is complete and monophyletic. + pub analysis: Option<TripletAnalysis>, + + /// Raise unless the tree should be excluded from analysis + /// based on this triplet topology. + pub included: bool, +} + +#[derive(Serialize)] +pub struct TripletAnalysis { + /// Topology within this tree, + /// assuming the reference topology was 'ABC' ~ '((A, B), C)'. + pub topology: TripletTopology, + + /// Estimate of the number of mutations along the triplet branches. + /// The internal branch length 'd' either represent: + /// - 'ab' in ((:a, :b):ab, :c) for topology 'ABC'. + /// - 'ac' in (:b, (:a, :c):ac) for topology 'ACB'. + /// - 'bc' in (:a, (:c, :b):bc) for topology 'BCA'. + pub branches_lengths: TripletLengths, + + /// True if the topology is considered sufficiently resolved + /// to exclude discordant scenarios from likelihood calculations. + pub resolved: bool, +} + +#[derive(Serialize)] +pub struct TripletLengths { + pub a: NbBases, + pub b: NbBases, + pub c: NbBases, + pub d: NbBases, +} + +#[derive(Serialize, Default)] +pub struct Outgroup<'i> { + /// LCA(outgroup): the ost recent ancestor + /// of the designated outgroup species found in this tree. + /// Undefined if all outgroup species were missing. + pub lca: Option<usize>, + /// Outgroup species not found within this tree. + pub missing: Vec<&'i str>, + /// Paraphyletic species found within this tree: + /// these descend from LCA(outgroup) but don't belong to the designated outgroup. + pub paraphyletic: Vec<&'i str>, + + /// Raise unless the tree should be excluded from analysis + /// based on this outgroup topology. + pub included: bool, +} + +#[derive(Serialize)] +pub struct Top<'i> { + /// LCA(top): the most recent ancestor + /// of LCA(triplet) and LCA(outgroup) found in this tree. + pub lca: usize, + /// Species not descending from LCA(top). + /// If any, then LCA(top) is not the root of the tree. + pub external: Vec<&'i str>, + /// Species descending from LCA(top), + /// but neither from LCA(triplet) or LCA(outgroup). + /// Only defined if there is no direct lineage between LCA(triplet) and LCA(outgroup). + pub internal: Option<Internal<'i>>, + + /// Raise unless the tree should be excluded from analysis + /// based on this tree top topology. + pub included: bool, +} + +#[derive(Serialize)] +pub struct Internal<'i> { + /// Species branching between LCA(top) and LCA(triplet). + pub triplet: Vec<&'i str>, + /// Species branching between LCA(top) and LCA(outgroup). + pub outgroup: Vec<&'i str>, +} + +#[derive(Serialize, Default)] +pub struct MeanLengths { + /// Calculated over all species of interest found in this tree. + pub total: Option<BranchLength>, + + /// Calculated over the focal triplet species found. + pub triplet: Option<BranchLength>, + + /// Calculated over the designated outgroup species found + /// + the species designated as 'other'. + pub outgroup_other: Option<BranchLength>, +} + +//================================================================================================== +// Ease construction from internal types. + +pub fn resolve_topology<'i>( + top: &TopologicalAnalysis, + taxa: &Taxa, + triplet_other_monophyly: bool, + interner: &'i Interner, +) -> (Triplet<'i>, Outgroup<'i>, Option<Top<'i>>) { + let TopologicalAnalysis { triplet, outgroup, top } = top; + + let triplet = Triplet::resolve(triplet, taxa.triplet.iter(), interner); + let outgroup = Outgroup::resolve(outgroup, taxa.outgroup.iter(), interner); + let top = top + .as_ref() + .map(|top| Top::resolve(top, triplet_other_monophyly, interner)); + + (triplet, outgroup, top) +} + +impl<'i> Triplet<'i> { + pub fn resolve<'int: 'i>( + sa: &SectionAnalysis, + triplet: impl Iterator<Item = SpeciesSymbol>, + interner: &'int Interner, + ) -> Self { + use SectionAnalysis as SA; + let resolve = |s| interner.resolve(s).unwrap(); + let (lca, missing, paraphyletic) = match sa { + SA::AllMissing => (None, triplet.map(resolve).collect(), Vec::new()), + SA::MissingSome(missing, lca) => ( + Some(lca.id), + missing.iter().copied().map(resolve).collect(), + lca.paraphyletic.iter().copied().map(resolve).collect(), + ), + SA::AllFound(lca) => ( + Some(lca.id), + ArrayVec::new(), + lca.paraphyletic.iter().copied().map(resolve).collect(), + ), + }; + Self { + lca, + missing, + paraphyletic, + analysis: None, + included: sa.is_included_as_triplet(), + } + } +} + +impl<'i> Outgroup<'i> { + pub fn resolve<'int: 'i, 's>( + sa: &'s SectionAnalysis, + outgroup: impl Iterator<Item = &'s SpeciesSymbol>, + interner: &'int Interner, + ) -> Self { + use SectionAnalysis as SA; + let resolve = |&s| interner.resolve(s).unwrap(); + let (lca, missing, paraphyletic) = match sa { + SA::AllMissing => (None, outgroup.map(resolve).collect(), Vec::new()), + SA::MissingSome(missing, lca) => ( + Some(lca.id), + missing.iter().map(resolve).collect(), + lca.paraphyletic.iter().map(resolve).collect(), + ), + SA::AllFound(lca) => ( + Some(lca.id), + Vec::new(), + lca.paraphyletic.iter().map(resolve).collect(), + ), + }; + Self { + lca, + missing, + paraphyletic, + included: sa.is_included_as_outgroup(), + } + } +} + +impl<'i> Top<'i> { + pub fn resolve<'int: 'i, 's>( + top: &'s TreeTop, + triplet_other_monophyly: bool, + interner: &'int Interner, + ) -> Self { + let resolve = |&s| interner.resolve(s).unwrap(); + let TreeTop { lca, ref external, ref internal } = *top; + Self { + lca, + external: external.iter().map(resolve).collect(), + internal: internal.as_ref().map(|i| Internal { + triplet: i.triplet_side().iter().map(resolve).collect(), + outgroup: i.outgroup_side().iter().map(resolve).collect(), + }), + included: top.is_included(triplet_other_monophyly), + } + } +} + +//-------------------------------------------------------------------------------------------------- + +impl TripletAnalysis { + pub fn new(loc: &LocalGeneTriplet) -> Self { + let LocalGeneTriplet { + topology, branches_lengths: [a, b, c, d], resolved, .. + } = *loc; + Self { + topology, + branches_lengths: TripletLengths { a, b, c, d }, + resolved, + } + } +} diff --git a/src/tree/node_to_leaves.rs b/src/tree/node_to_leaves.rs index 96a29faf47c27887dcc5ff4aab164d5e5c369237..fcd801f8efdf8ffdaaf3eebb697e58daf272f703 100644 --- a/src/tree/node_to_leaves.rs +++ b/src/tree/node_to_leaves.rs @@ -48,7 +48,6 @@ impl<B, IN, TN> Tree<B, IN, TN> { { self.node_to_leaves(self.root(), init, accumulator, terminator) } - } // Accumulating iterator from root to leaves.