diff --git a/src/bin/aphid/main.rs b/src/bin/aphid/main.rs index 81f88b098b8cd6e0de9bdfd5758e1c5bca80d8e6..5d6dddfcc1d7618f468fc711ec3ccec2b8d1dc73 100644 --- a/src/bin/aphid/main.rs +++ b/src/bin/aphid/main.rs @@ -175,9 +175,7 @@ fn run() -> Result<(), AphidError> { &config.taxa.triplet, config.unresolved_length, ); - if config.unresolved_length.is_some() && trip.topology.is_none() { - n_unresolved += 1; - } + n_unresolved += u64::from(!trip.resolved); trip }); @@ -314,11 +312,13 @@ fn run() -> Result<(), AphidError> { format!("{n}"), format!("{np}"), if let Some(trip) = otrip { - if let Some(top) = trip.topology { - format!("{top}") + let top = trip.topology; + if trip.resolved { + format!("{top}").yellow() } else { - "<?>".yellow().string() + format!("≈{top}").black().italic() } + .string() } else { nok.string() }, @@ -402,7 +402,7 @@ fn run() -> Result<(), AphidError> { if let Some(min) = config.unresolved_length { let n = passed_triplets .iter() - .filter_map(|g| g.as_ref().map(|g| u64::from(g.topology.is_none()))) + .filter_map(|g| g.as_ref().map(|g| u64::from(!g.resolved))) .sum::<u64>(); eprintln!( " - {} triplet{} considered unresolved (internal branch length ⩽ {min}).", diff --git a/src/config/check.rs b/src/config/check.rs index 18ddb66a18b77a46cb008c1aeb45698442388967..0a27096d892fcf28b222fde57a7bdc8ea318c386 100644 --- a/src/config/check.rs +++ b/src/config/check.rs @@ -37,14 +37,20 @@ pub struct Config { pub filters: Filters, // If set, when the internal triplet branch length "d" - // is (non-strictly) less than this value, - // consider that its topology is *unresolved*. - // In this situation, every scenario contributes to the likelihood, - // because the topology "maybe matches", - // and it is considered that "observed d = 0". + // is less or equal to this value, + // consider that its topology is not resolved enough + // to exclude discordant scenarios. + // In this situation, + // every scenario contributes to the likelihood + // instead of only the concordant ones. + // The possible internal branch discordance + // between 'actual' and 'expected' length + // is neglected because the the actual length is small. + // For this reason, *enforce* that it be small. // The value is given in *mutation* units, so branch length × sequence length. pub unresolved_length: Option<MeanNbBases>, } +const MAX_UNRESOLVED_LENGTH: f64 = 0.5; #[derive(Debug)] pub struct Filters { @@ -114,6 +120,14 @@ impl Config { cannot be negative. Received 'unresolved_length = {ul}'.") ) } + if ul > MAX_UNRESOLVED_LENGTH { + err!( + ("The minimum internal branch length for tree considered resolved \ + is assumed to be small, \ + so the program forbids that it be superior to {MAX_UNRESOLVED_LENGTH}. \ + Received 'unresolved_length = {ul}'.") + ) + } } // Most checks implemented within `TryFrom` trait. diff --git a/src/gene_tree/triplet.rs b/src/gene_tree/triplet.rs index 71830ed05bb120c58470115db271e9938174075d..8565b4ceea2e3a4f07055bd09d100d72f3a1296b 100644 --- a/src/gene_tree/triplet.rs +++ b/src/gene_tree/triplet.rs @@ -27,32 +27,28 @@ pub enum Topology { #[allow(clippy::module_name_repetitions)] #[cfg_attr(test, derive(PartialEq, Debug))] pub struct LocalGeneTriplet { - // /!\ - // Rounded branches lengths, in canonicalized order: [{a, b}, d, {c}], - // regardless of the contingent input gene tree ordering. - // - // ┌──┴──┐ OR ┌──┴──┐ - // │d │ │ │d - // ┌─┴─┐ │c │c ┌─┴─┐ - // │a │b │ │ │a │b + // Rounded branches lengths, + // in an order making it easy to map + // against species in the reference phylogenetic triplet. + // If the triplet is ((A, B), C), + // the canonical order is: [a, b, c, 'd'], + // with 'd' the length of the "internal branch", + // regardless of the topology actually observed. + // This topology is stored aside to disambiguate, + // as either 'ABC', 'ACB', 'BCA'. + // Here are examples of lengths observations encoding: + // ((:a, :b):ab, :c) → [a, b, c, ab] + ABC + // (:b, (:a, :c):ac) → [a, b, c, ac] + ACB + // (:a, (:c, :b):bc) → [a, b, c, bc] + BCA // // Lengths stored here are not 'per-site' guesses, // they are a rounded integer estimate of the number of mutations. - - // If considered too small, the length 'd' is forced to zero, - // and then the topology is considered 'unresolved': - // - // ┌──┼──┐ d = 0 - // │a │b │c - // - // In this situation, all scenarios are considered possible. - - // Notice that d comes *before* c, and the a/b order is arbitrary. - // /!\ - pub(crate) branches_lengths: [NbBases; 4], // [a, b, d, c] pub sequence_length: NbBases, - // Concordance with respect to the species triplet. - pub topology: Option<Topology>, + pub(crate) branches_lengths: [NbBases; 4], + pub topology: Topology, + // Raise if the topology is considered sufficiently resolved + // to exclude discordant scenarios. + pub resolved: bool, } // Extend the above type with data @@ -77,6 +73,7 @@ pub fn extract_local( ) -> LocalGeneTriplet { use Node as N; let err = || panic!("Did invalid trees pass the filter?"); + let seqlen = gtree.sequence_length; // Extract triplet ancestor. let SectionAnalysis::AllFound(SectionLca { id: lca_id, .. }) = analysis.triplet else { @@ -98,29 +95,24 @@ pub fn extract_local( n }); - // Possibly, the internal branch length is too small to be considered positive, - // and the topology to be considered resolved. - let resolved = if let Some(min) = unresolved_length { - min < nb_mutations(uv.branch, gtree.sequence_length) - } else { - true - }; - + // Compare to the species topology to figure the order. + let [a, b, c] = species_triplet.as_array(); + let uv = nb_mutations(uv.branch, seqlen); LocalGeneTriplet { branches_lengths: { - // Extract branches lengths in canonicalized nodes order. - let [u, v, w] = [u, v, w].map(|n| n.branch); - let uv = if resolved { uv.branch } else { 0. }; + // Reorganize terminal branches lengths so they match species order. + let ids_lens = [u, v, w].map(|n| (n.payload, n.branch)); + let abc = [a, b, c].map(|s| ids_lens.iter().find(|(id, _)| *id == s).unwrap().1); + let [a, b, c] = abc.map(|l| nb_mutations(l, seqlen)); + // And let the internal branch be whatever internal branch (ab, ac, bc), + // its exact meaning being specified by the topology calculated hereafter. // Convert per-site estimates to total, sequence-wide estimates, // and round to an integer to match the Poisson distribution model. #[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)] - [u, v, uv, w].map(|l| nb_mutations(l, gtree.sequence_length).round() as NbBases) + [a, b, c, uv].map(|l| l.round() as NbBases) }, - sequence_length: gtree.sequence_length, - topology: resolved.then(|| { + topology: { use Topology as T; - // Compare to the species topology to figure the order. - let [a, b, c] = species_triplet.as_array(); // Only the outermost node is useful in this respect. let w = w.payload; if w == c { @@ -132,7 +124,9 @@ pub fn extract_local( } else { err() } - }), + }, + sequence_length: seqlen, + resolved: if let Some(ul) = unresolved_length { uv > ul } else { true }, } } @@ -199,20 +193,22 @@ mod tests { check( "((T:2,U:3):1,V:4):0;", LocalGeneTriplet { - branches_lengths: [20, 30, 10, 40], + branches_lengths: [20, 30, 40, 10], sequence_length, - topology: Some(T::ABC), + resolved: true, + topology: T::ABC, }, ); // Considered unresolved. - for uv in [0., 0.03, 0.05] { + for (uv, d) in [(0., 0), (0.03, 0), (0.05, 1) /* ⚠ Rounds to 1 ⚠ */] { check( &format!("((T:2,U:3):{uv},V:4):0;"), LocalGeneTriplet { - branches_lengths: [20, 30, 0, 40], + branches_lengths: [20, 30, 40, d], sequence_length, - topology: None, + resolved: false, + topology: T::ABC, }, ); } @@ -221,9 +217,10 @@ mod tests { check( &format!("((T:2,U:3):{uv},V:4):0;"), LocalGeneTriplet { - branches_lengths: [20, 30, 1, 40], + branches_lengths: [20, 30, 40, 1], sequence_length, - topology: Some(T::ABC), + resolved: true, + topology: T::ABC, }, ); @@ -239,20 +236,22 @@ mod tests { check( "(V:1,(T:3,U:4):2):0;", LocalGeneTriplet { - branches_lengths: [30, 40, 20, 10], + branches_lengths: [30, 40, 10, 20], sequence_length, - topology: Some(T::ABC), + resolved: true, + topology: T::ABC, }, ); // Considered unresolved. - for uv in [0., 0.03, 0.05] { + for (uv, d) in [(0., 0), (0.03, 0), (0.05, 1) /* ⚠ Rounds to 1 ⚠ */] { check( &format!("(V:1,(T:3,U:4):{uv}):0;"), LocalGeneTriplet { - branches_lengths: [30, 40, 0, 10], + branches_lengths: [30, 40, 10, d], sequence_length, - topology: None, + resolved: false, + topology: T::ABC, }, ); } @@ -261,9 +260,10 @@ mod tests { check( &format!("(V:1,(T:3,U:4):{uv}):0;"), LocalGeneTriplet { - branches_lengths: [30, 40, 1, 10], + branches_lengths: [30, 40, 10, 1], sequence_length, - topology: Some(T::ABC), + resolved: true, + topology: T::ABC, }, ); @@ -277,9 +277,10 @@ mod tests { check( "(V:1,(U:3,T:4):2):0;", LocalGeneTriplet { - branches_lengths: [30, 40, 20, 10], + branches_lengths: [40, 30, 10, 20], sequence_length, - topology: Some(T::ABC), + resolved: true, + topology: T::ABC, }, ); @@ -292,9 +293,10 @@ mod tests { check( "((U:2,T:3):1,V:4):0;", LocalGeneTriplet { - branches_lengths: [20, 30, 10, 40], + branches_lengths: [30, 20, 40, 10], sequence_length, - topology: Some(T::ABC), + resolved: true, + topology: T::ABC, }, ); @@ -310,9 +312,10 @@ mod tests { check( "(T:1,(V:3,U:4):2):0;", LocalGeneTriplet { - branches_lengths: [30, 40, 20, 10], + branches_lengths: [10, 40, 30, 20], sequence_length, - topology: Some(T::BCA), + resolved: true, + topology: T::BCA, }, ); @@ -326,9 +329,10 @@ mod tests { check( "(U:1,(V:3,T:4):2):0;", LocalGeneTriplet { - branches_lengths: [30, 40, 20, 10], + branches_lengths: [40, 10, 30, 20], sequence_length, - topology: Some(T::ACB), + resolved: true, + topology: T::ACB, }, ); @@ -342,9 +346,10 @@ mod tests { check( "((U:2,V:3):1,T:4):0;", LocalGeneTriplet { - branches_lengths: [20, 30, 10, 40], + branches_lengths: [40, 20, 30, 10], sequence_length, - topology: Some(T::BCA), + resolved: true, + topology: T::BCA, }, ); @@ -358,9 +363,10 @@ mod tests { check( "((T:2,V:3):1,U:4):0;", LocalGeneTriplet { - branches_lengths: [20, 30, 10, 40], + branches_lengths: [20, 40, 30, 10], sequence_length, - topology: Some(T::ACB), + resolved: true, + topology: T::ACB, }, ); @@ -382,9 +388,10 @@ mod tests { check( "(((O:3,(P:5,Q:6):4):2,X:7):1,(Y:9,(((T:13,U:14):12,V:15):11,Z:16):10):8):0;", LocalGeneTriplet { - branches_lengths: [130, 140, 120, 150], + branches_lengths: [130, 140, 150, 120], sequence_length, - topology: Some(T::ABC), + resolved: true, + topology: T::ABC, }, ); @@ -405,9 +412,10 @@ mod tests { check( "(((O:3,(P:5,Q:6):4):2,X:7):1,(Y:9,(((U:13,V:14):12,T:15):11,Z:16):10):8):0;", LocalGeneTriplet { - branches_lengths: [130, 140, 120, 150], + branches_lengths: [150, 130, 140, 120], sequence_length, - topology: Some(T::BCA), + resolved: true, + topology: T::BCA, }, ); } diff --git a/src/model/likelihood.rs b/src/model/likelihood.rs index 84fddea78addfbbd30a284c708c58cb3cd54e20f..cafab336fcbe0f2c397e7282d83639c06f942713 100644 --- a/src/model/likelihood.rs +++ b/src/model/likelihood.rs @@ -13,7 +13,18 @@ use super::{ scores::Scores, Parameters, }; -use crate::gene_tree::{nb_mutations, triplet::GeneTriplet}; +use crate::gene_tree::{nb_mutations, triplet::GeneTriplet, TripletTopology}; + +//-------------------------------------------------------------------------------------------------- +impl GeneTriplet { + // Decide whether contribution to the overall likelihood is relevant. + // Only include concordant scenarios, + // unless the triplet is not resolved enough. + fn contributes(&self, scenario: &Scenario) -> bool { + let triplet = &self.local; + triplet.topology == scenario.topology() || !triplet.resolved + } +} //-------------------------------------------------------------------------------------------------- macro_rules! probabilities_ils_nongf { @@ -122,16 +133,14 @@ impl GeneFlowScenario { //-------------------------------------------------------------------------------------------------- // Expected branches lengths for the triplet, according to the given scenario: -// Return only pair of short/long branches lengths (S/L). +// Return only pair of short/long branches lengths [S, L]. // // ┌──┴──┐ // d│L-S │ // ┌─┴─┐ c│L // a│S b│S │ // -// Abstract over operand types, -// not with trait bounds because they would be too intricate. -macro_rules! branches_lengths { +macro_rules! compact_branches_lengths { ($scenario:expr, $parameters:expr) => {{ use Scenario as S; let Parameters { @@ -164,71 +173,60 @@ macro_rules! branches_lengths { } }}; } + +// Expand the above compact [S, L] representation +// into the corresponding expected [a, b, c, d] order +// matching species topologies. +macro_rules! expand_lengths { + ($scenario:expr, $sl:expr) => {{ + use TripletTopology as T; + let [ref s, ref l] = $sl; + let (d, [a, b, c]) = ( + l - s, // Internal branch always has this expected value. + match $scenario.topology() { + T::ABC => [s, s, l], // Long branch is C. + T::ACB => [s, l, s], // Long branch is B. + T::BCA => [l, s, s], // Long branch is A. + }, + ); + ([a, b, c], d) + }}; +} + impl Scenario { - pub(crate) fn branches_lengths(&self, p: &Parameters<f64>) -> [f64; 2] { - branches_lengths!(self, p) + pub(crate) fn branches_lengths(&self, parms: &Parameters<f64>) -> [f64; 4] { + let sl = compact_branches_lengths!(self, parms); + let ([&a, &b, &c], d) = expand_lengths!(self, sl); + [a, b, c, d] } - pub(crate) fn branches_lengths_tensors(&self, p: &Parameters<Tensor>) -> [Tensor; 2] { - // branches_lengths!(self, p) - { - use Scenario as S; - let Parameters { - theta, - tau_1, - tau_2, - gf_times, - .. - } = p; - let gf_times = |i| &gf_times.0[i]; - match self { - S::NoEvent => [tau_1 + 0.5 * theta, tau_2 + 0.5 * theta], - S::IncompleteLineageSorting(_) => { - [tau_2 + (1. / 6.) * theta, tau_2 + (2. / 3.) * theta] - } - &S::GeneFlow(GeneFlowScenario { - time: i, - ref topology, - }) => { - use GeneFlowTopology as G; - let tau_g = tau_1 * gf_times(i); - [ - tau_g, - match topology { - G::ABC | G::CAB | G::CBA => tau_2, - G::ACB | G::BCA => tau_1, - } + 0.5 * theta, - ] - } - } - } + pub(crate) fn branches_lengths_tensor(&self, parms: &Parameters<Tensor>) -> Tensor { + let sl = compact_branches_lengths!(self, parms); + let ([a, b, c], ref d) = expand_lengths!(self, sl); + Tensor::concatenate(&[a, b, c, d], 0) } } // Probability that the given gene triplet occurs -// given the suggested [S, L] branches lengths. +// given the expected branches lengths. impl GeneTriplet { #[allow(clippy::many_single_char_names)] - fn branches_lengths_likelihood(&self, [s, l]: [f64; 2]) -> f64 { + fn branches_lengths_likelihood(&self, expected_lengths: [f64; 4]) -> f64 { // (locally match the paper notations) let alpha_i = self.relative_mutation_rate; let loc = &self.local; // Minimize .exp() evaluations within poisson density by incursing into log space. - let ln_pmf = |a, e| { - // The expected branch length is not yet a number of mutations. - let actual = u64::from(a); - let expected = alpha_i * nb_mutations(e, loc.sequence_length); - ln_poisson_density(actual, expected) - }; - - // Match every actual branch length against its own expected length. - let [a, b, d /*~ab*/, c] = loc.branches_lengths; - let pa = ln_pmf(a, s); // One short branch. - let pb = ln_pmf(b, s); // The other short branch. - let pc = ln_pmf(c, l); // The long, external branch. - let pd = ln_pmf(d, l - s); // The internal branch. - - (pa + pb + pc + pd).exp() + loc.branches_lengths + .iter() + .zip(expected_lengths.iter()) + .map(|(&a, &e)| { + // The expected branch length is not yet a number of mutations. + let actual = u64::from(a); + let expected = alpha_i * nb_mutations(e, loc.sequence_length); + ln_poisson_density(actual, expected) + }) + .sum::<f64>() + .exp() } } @@ -241,18 +239,14 @@ fn ln_poisson_density(n: u64, lambda: f64) -> f64 { // integrating over all possible scenarios. impl GeneTriplet { pub fn likelihood(&self, p: &Parameters<f64>) -> f64 { - // Only sum over concordant scenarios. let mut prob = 0.; for scenario in Scenario::iter(p.gf_times.0.len()) { - if let Some(top) = self.local.topology { - if top != scenario.topology() { - continue; - } + if self.contributes(&scenario) { + let prob_scenario = scenario.probability(p); + let expected_lengths = scenario.branches_lengths(p); + let prob_branches = self.branches_lengths_likelihood(expected_lengths); + prob += prob_scenario * prob_branches; } - let prob_scenario = scenario.probability(p); - let expected_lengths = scenario.branches_lengths(p); - let prob_branches = self.branches_lengths_likelihood(expected_lengths); - prob += prob_scenario * prob_branches; } prob } @@ -309,8 +303,7 @@ pub(crate) fn likelihood_tensors( let prior = scenario.probability_tensors(&parms); // Expected branch lengths. - let [ref s, ref l] = scenario.branches_lengths_tensors(&parms); - let expected_abcd = Tensor::concatenate(&[s, s, &(l - s), l], 0); + let expected_abcd = scenario.branches_lengths_tensor(&parms); // Log-densities of poisson distribution. let lambda = Tensor::outer(alphaseq, &expected_abcd); @@ -409,14 +402,11 @@ fn data_tensors( // later combined into a sum. // For every scenario, pick indices of the trees matching it. let mut match_trees: Vec<_> = iter::repeat_with(Vec::new).take(n_scenarios).collect(); - for (i_tree, trip) in triplets.iter().enumerate() { + for (i_tree, triplet) in triplets.iter().enumerate() { for (i_scenario, scenario) in scenarios.iter().enumerate() { - if let Some(top) = trip.local.topology { - if top != scenario.topology() { - continue; - } + if triplet.contributes(scenario) { + match_trees[i_scenario].push(i64::try_from(i_tree).unwrap()); } - match_trees[i_scenario].push(i64::try_from(i_tree).unwrap()); } } let scenario_trees_index = match_trees