diff --git a/doc/src/fdetail b/doc/src/fdetail
new file mode 100755
index 0000000000000000000000000000000000000000..2b084498323831d02573a32d64e463ca38fcf6c2
--- /dev/null
+++ b/doc/src/fdetail
@@ -0,0 +1,2 @@
+# (just to ease doc generation)
+./structfield output::detail::$1 $2
diff --git a/doc/src/fglobal b/doc/src/fglobal
new file mode 100755
index 0000000000000000000000000000000000000000..96b812c517d36d1ae59c576f277541ae2d6d1a98
--- /dev/null
+++ b/doc/src/fglobal
@@ -0,0 +1,2 @@
+# (just to ease doc generation)
+./structfield output::global::$1 $2
diff --git a/doc/src/learn.md b/doc/src/learn.md
index 9a3a7b67995016fde0bae5663839ec3dfbd2bcf3..23b84dbf2212e66d8533ef4ab2859d0ad0ffb5a3 100644
--- a/doc/src/learn.md
+++ b/doc/src/learn.md
@@ -112,6 +112,16 @@ Instead of the constrained \\(𝜃\\), \\(𝜏_1\\), \\(𝜏_2\\) *etc.*,
 the concrete optimisation targets for aphid
 are the unconstrained \\(s_𝜃\\), \\(s_{𝜏_1}\\), \\(s_{𝛥𝜏}\\) *etc.*
 
-## Maximizing likelihood
-
-<!-- HERE: into BFGS + Wolfe -->
+## Maximizing likelihood { #opt }
+
+<div class="warning">
+TODO: Detail BFGS + Wolfe search heuristic method
++ configuration options.
+</div>
+
+Until this section is written,
+the reader can refer to
+[Nocedal & Wright (2006)](https://doi.org/10.1007/978-0-387-40065-5)
+for a description of the algorithm used,
+and to [`src/optim/bfgs.rs`](https://gitlab.mbb.cnrs.fr/ibonnici/aphid/-/blob/rust-implementation/src/optim/bfgs.rs?ref_type=heads)
+and [`src/optim/wolfe_search.rs`](https://gitlab.mbb.cnrs.fr/ibonnici/aphid/-/blob/rust-implementation/src/optim/wolfe_search.rs?ref_type=heads)
+for implementation details.
diff --git a/doc/src/output.md b/doc/src/output.md
index b0d84e96bb994a874e79333bc9261e435aa55ed1..281cde5cfc0fe0383e9254eb977dc50554e4897f 100644
--- a/doc/src/output.md
+++ b/doc/src/output.md
@@ -1 +1,147 @@
 # Outputs
+
+When run, for instance with:
+
+```sh
+$ aphid ./config.toml output
+```
+
+aphid displays summarized information about the analysis
+on the console standard output.
+This output is only meant as an informative summary for humans:
+it is not supposed to be processed by downstream programs,
+and its layout may change arbitrarily in the future.
+
+In addition,
+aphid outputs very detailed, structured information about the analysis
+in the form of a collection of files in a newly created folder
+named after its second argument (the `output/` folder in the above example).
+Here is what this folder contains:
+
+```
+<OUTPUT>/        # The folder name as per the second argument given on the command line.
+├── config.json  # Complete information about the configuration used for this run.
+├── global.json  # The most important "forest-level" results.
+├── detail.json  # Detailed results per gene tree.
+├── trees.csv    # Summarized results per gene tree in tabular form.
+└── search/      # Traces of the heuristic searches for likelihood (one per starting point).
+    ├── 1/
+    │   ├── init.json    # Starting point for this search.
+    │   ├── status.json  # Result of the search (error or best parameters found).
+    │   ├── global.csv   # Every BFGS step.
+    │   └── detail.csv   # Every line search step.
+    ├── 2/ …
+    ├── 3/ …
+    ⋮
+```
+
+These files are formally structured in `.json` or `.csv` format
+to ease their processing by downstream programs.
+The meaning of their content is detailed below:
+
+<!-- toc -->
+
+<!-- TODO have rustdoc automatically list the fields?
+-->
+
+## The `global.json` file
+<!-- cmdrun ./doc output::global::Global -->
+
+<!-- cmdrun ./fglobal Global n_trees -->
+<!-- cmdrun ./fglobal Global n_excluded_triplets_topologies -->
+<!-- cmdrun ./fglobal Global n_unresolved_triplets -->
+<!-- cmdrun ./fglobal Global n_excluded_outgroup_topologies -->
+<!-- cmdrun ./fglobal Global n_excluded_topologies -->
+<!-- cmdrun ./fglobal Global mean_branch_length -->
+<!-- cmdrun ./fglobal Global mean_length_triplet -->
+<!-- cmdrun ./fglobal Global mean_length_outgroup_other -->
+<!-- cmdrun ./fglobal Global imbalance -->
+<!-- cmdrun ./fglobal Global triplet_longer -->
+<!-- cmdrun ./fglobal Global shape -->
+<!-- cmdrun ./fglobal Global n_excluded_geometries -->
+<!-- cmdrun ./fglobal Global n_included_trees -->
+<!-- cmdrun ./fglobal Global estimate -->
+    <!-- cmdrun ./fglobal Estimate ln_likelihood -->
+    <!-- cmdrun ./fglobal Estimate parameters -->
+
+## The `detail.json` file
+<!-- cmdrun ./doc output::detail::Tree -->
+
+<!-- cmdrun ./fdetail Tree id -->
+<!-- cmdrun ./fdetail Tree n_bases -->
+<!-- cmdrun ./fdetail Tree n_nodes_raw -->
+<!-- cmdrun ./fdetail Tree n_nodes_pruned -->
+<!-- cmdrun ./fdetail Tree triplet -->
+    <!-- cmdrun ./fdetail Triplet lca -->
+    <!-- cmdrun ./fdetail Triplet missing -->
+    <!-- cmdrun ./fdetail Triplet paraphyletic -->
+    <!-- cmdrun ./fdetail Triplet analysis -->
+        <!-- cmdrun ./fdetail TripletAnalysis topology -->
+        <!-- cmdrun ./fdetail TripletAnalysis branches_lengths -->
+        <!-- cmdrun ./fdetail TripletAnalysis resolved -->
+    <!-- cmdrun ./fdetail Triplet included -->
+<!-- cmdrun ./fdetail Tree outgroup -->
+    <!-- cmdrun ./fdetail Outgroup lca -->
+    <!-- cmdrun ./fdetail Outgroup missing -->
+    <!-- cmdrun ./fdetail Outgroup paraphyletic -->
+    <!-- cmdrun ./fdetail Outgroup included -->
+<!-- cmdrun ./fdetail Tree top -->
+    <!-- cmdrun ./fdetail Top lca -->
+    <!-- cmdrun ./fdetail Top internal -->
+        <!-- cmdrun ./fdetail Internal triplet -->
+        <!-- cmdrun ./fdetail Internal outgroup -->
+    <!-- cmdrun ./fdetail Top external -->
+    <!-- cmdrun ./fdetail Top included -->
+<!-- cmdrun ./fdetail Tree topology_included -->
+<!-- cmdrun ./fdetail Tree mean_lengths -->
+    <!-- cmdrun ./fdetail MeanLengths total -->
+    <!-- cmdrun ./fdetail MeanLengths triplet -->
+    <!-- cmdrun ./fdetail MeanLengths outgroup_other -->
+<!-- cmdrun ./fdetail Tree local_shape -->
+<!-- cmdrun ./fdetail Tree geometry_included -->
+<!-- cmdrun ./fdetail Tree mutation_rate -->
+<!-- cmdrun ./fdetail Tree ln_likelihood -->
+
+## The summarized `trees.csv` table
+
+There is one line in this table per gene tree analyzed.
+Columns represent a redundant, flattened version
+of the structured information available in the above `detail.json` file,
+which we expect to be easier to work with
+in downstream table-processing software.
+
+## The `search/` traces
+
+Since several starting points may be used
+in the likelihood maximization heuristic,
+aphid may produce several exploration traces,
+stored within subfolders here.
+
+### The `init.json` file.
+
+This file records which initial parameters were used for the search.
+
+### The `status.json` summary.
+
+This file summarizes the terminal status of the search:
+either details about the failure if the search failed,
+or the following information if it succeeded:
+
+<!-- cmdrun ./structfield learn::BestFound parameters -->
+<!-- cmdrun ./structfield learn::BestFound scores -->
+<!-- cmdrun ./structfield learn::BestFound gradient -->
+<!-- cmdrun ./structfield learn::BestFound ln_likelihood -->
+<!-- cmdrun ./structfield learn::BestFound n_evaluations -->
+<!-- cmdrun ./structfield learn::BestFound n_differentiations -->
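+
+For a successful search, for instance,
+the file may look like the following sketch
+(values here are only illustrative,
+and the exact shape of the `parameters`, `scores` and `gradient` objects
+depends on the model configuration):
+
+```json
+{
+  "parameters": { … },
+  "scores": { … },
+  "gradient": { … },
+  "ln_likelihood": -1234.5678,
+  "n_evaluations": 207,
+  "n_differentiations": 64
+}
+```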
+
+### The `global.csv` trace.
+
+If produced,
+there is one line in this table per BFGS step taken during the search
+(see [BFGS](./learn.md#opt)).
+
+### The `detail.csv` trace.
+
+If produced,
+there is one line in this table per line search step taken,
+for every BFGS step,
+in search of a step size meeting the strong Wolfe criteria
+(see [Line Search](./learn.md#opt)).
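+
+For reference,
+in the minimization setting of Nocedal & Wright
+(aphid maximizes the ln-likelihood,
+which amounts to minimizing its negation),
+a step \\(𝛼\\) taken from a point \\(x\\)
+along a descent direction \\(p\\)
+meets the strong Wolfe criteria when,
+for fixed constants \\(0 < c_1 < c_2 < 1\\):
+
+\\[f(x + 𝛼p) ≤ f(x) + c_1 𝛼 ∇f(x)^⊤ p\\]
+
+\\[|∇f(x + 𝛼p)^⊤ p| ≤ c_2 |∇f(x)^⊤ p|\\]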
diff --git a/doc/src/structfield b/doc/src/structfield
new file mode 100755
index 0000000000000000000000000000000000000000..c1b0c509d8292df72c880baa97ba4c13884fd8c8
--- /dev/null
+++ b/doc/src/structfield
@@ -0,0 +1,4 @@
+# Fetch a struct field docstring to display as a markdown list item.
+struct=$1
+field=$2
+echo "- \`$field\`: $(./doc $struct::$field)"
diff --git a/src/bin/aphid/main.rs b/src/bin/aphid/main.rs
index 7b0dab7d128d8c25b791aa7f8dc5e81f9898c5c3..40ac2ce9965b52676de7cfd5741fcf2b00647983 100644
--- a/src/bin/aphid/main.rs
+++ b/src/bin/aphid/main.rs
@@ -15,7 +15,7 @@ use aphid::{
     io::{self, ToRelative},
     it_mean::SummedMean,
     learn::heuristic_starting_point,
-    ln_likelihood, optimize_likelihood,
+    optimize_likelihood,
     output::{self, detail, file},
     Config, GeneTree, GeneTriplet, GenesForest, LocalGeneTriplet, Parameters, VERSION,
 };
@@ -96,16 +96,15 @@ fn run() -> Result<(), Error> {
     // Extract the only information needed for likelihood calculation.
     let triplets = final_tree_selection(included, local_triplets, &mut global, &mut details);
 
-    // Output full per-tree detail.
-    write_detail(&output, &details)?;
-    write_csv(&output, &details)?;
-
     // Fit the model.
-    learn(&triplets, &mut global, &config)?;
+    let res = learn(&triplets, &mut global, &config, &mut details);
 
+    // Output full per-tree detail *even* if learning failed.
+    write_detail(&output, &details)?;
+    write_csv(&output, &details)?;
     write_global(&output, &global)?;
 
-    Ok(())
+    res
 }
 
 fn read_inputs(
@@ -426,33 +425,6 @@ fn final_tree_selection(
     triplets
 }
 
-fn write_detail(output: &Path, details: &[detail::Tree]) -> Result<(), Error> {
-    let path = output.join(file::DETAIL);
-    cprintln!(
-        "Write full trees analysis/filtering detail to <b>{}</>.",
-        path.to_relative()?.display()
-    );
-    io::write_file!(&path, "detailed results", "{:#}", json!(details))?;
-    Ok(())
-}
-
-fn write_csv(output: &Path, details: &[detail::Tree]) -> Result<(), Error> {
-    let path = output.join(file::CSV);
-    cprintln!(
-        "Summarize scalar values to <b>{}</>.",
-        path.to_relative()?.display()
-    );
-    let file = io::create_file(&path, "csv summary")?;
-    let mut wtr = csv::Writer::from_writer(file);
-    for detail in details {
-        let record: output::csv::Record = detail.into();
-        wtr.serialize(record).context(CsvErr)?;
-    }
-    wtr.flush()
-        .with_context(|_| io::AtErr { ctx: "flushing csv summary", path })?;
-    Ok(())
-}
-
 fn display_summary(included: &[usize], details: &[detail::Tree], config: &Config) {
     println!("\nSummary:");
@@ -570,6 +542,7 @@ fn learn(
     triplets: &[GeneTriplet],
     global: &mut output::Global,
     config: &Config,
+    detail: &mut [detail::Tree],
 ) -> Result<(), Error> {
     println!("\nLearn:");
@@ -586,14 +559,23 @@
     cprintln!("Optimized ln-likelihood: <g>{}</>", best.ln_likelihood);
     println!("Optimized parameters: {}\n", best.parameters.colored());
 
+    // Recalculate ln-likelihood per tree for the detailed output.
+    let mut detail = detail.iter_mut().filter(|gt| gt.geometry_included);
+    let mut sum = 0.;
+    for gtrip in triplets {
+        let lnl = gtrip.likelihood(&best.parameters).ln();
+        sum += lnl;
+        detail.next().unwrap().ln_likelihood = Some(lnl);
+    }
+    assert!(detail.next().is_none());
+
     // Smoke test:
     // TODO: turn into actual testing by including 'official' example data.
-    let r = ln_likelihood(triplets, &best.parameters);
     let t = best.ln_likelihood;
     assert!(
-        float_eq!(t, r, abs <= 1e-6),
+        float_eq!(t, sum, abs <= 1e-6),
         "Discrepancy between regular and tensor likelihood calculations?\n\
-         regular: {r}\n\
+         regular: {sum}\n\
          tensors: {t}\n"
     );
@@ -612,6 +594,33 @@ fn write_global(output: &Path, global: &output::Global) -> Result<(), Error> {
     Ok(())
 }
 
+fn write_detail(output: &Path, details: &[detail::Tree]) -> Result<(), Error> {
+    let path = output.join(file::DETAIL);
+    cprintln!(
+        "Write full trees analysis/filtering detail to <b>{}</>.",
+        path.to_relative()?.display()
+    );
+    io::write_file!(&path, "detailed results", "{:#}", json!(details))?;
+    Ok(())
+}
+
+fn write_csv(output: &Path, details: &[detail::Tree]) -> Result<(), Error> {
+    let path = output.join(file::CSV);
+    cprintln!(
+        "Summarize scalar values to <b>{}</>.",
+        path.to_relative()?.display()
+    );
+    let file = io::create_file(&path, "csv summary")?;
+    let mut wtr = csv::Writer::from_writer(file);
+    for detail in details {
+        let record: output::csv::Record = detail.into();
+        wtr.serialize(record).context(CsvErr)?;
+    }
+    wtr.flush()
+        .with_context(|_| io::AtErr { ctx: "flushing csv summary", path })?;
+    Ok(())
+}
+
 #[derive(Debug, Snafu)]
 #[snafu(context(suffix(Err)))]
 enum Error {
diff --git a/src/learn.rs b/src/learn.rs
index 8e7f279fede34c49eb9e4100a36ead35d1e3e0a3..7d05c0ddb5262f23bb8d1f38381caef5dd0a327d 100644
--- a/src/learn.rs
+++ b/src/learn.rs
@@ -23,7 +23,7 @@ use crate::{
         scenarios::Scenario,
         scores::{self, Scores},
     },
-    optim::{self, OptimResult},
+    optim::{self, bfgs, OptimResult},
     output, GeneTriplet, Parameters,
 };
 
@@ -33,13 +33,19 @@ pub type Status = Result<BestFound, optim::Error>;
 // Return both optimized inputs and output.
 #[derive(Debug, Serialize)]
 pub struct BestFound {
+    /// The best parameters found during this search.
     pub parameters: Parameters<f64>,
+    /// The unconstrained scores corresponding to these parameters.
     #[serde(serialize_with = "scores::ser")]
     pub scores: Scores<f64>,
+    /// The corresponding gradient values for the scores at this point.
     #[serde(serialize_with = "scores::ser")]
     pub gradient: Scores<Option<f64>>,
+    /// The best ln-likelihood value found.
     pub ln_likelihood: f64,
+    /// The number of times the likelihood function has been evaluated to find this result.
     pub n_evaluations: u64,
+    /// The number of times the likelihood *derivative* has been evaluated to find this result.
     pub n_differentiations: u64,
 }
 
@@ -102,19 +108,27 @@ pub fn optimize_likelihood(
     search: &[Search],
 ) -> Result<Vec<Status>, Error> {
     println!("Launch {} independent optimisation searches:", starts.len());
+
     let statuses = starts
         .par_iter()
         .zip(search.par_iter())
         .map(|(p, s)| -> Result<Status, Error> {
+            write_to_file(
+                p,
+                s.bfgs.log.as_ref(),
+                output::file::trace::INIT,
+                "init",
+                "optimisation starting point",
+            )?;
             let status = optimize_likelihood_single(triplets, p, s)?;
             // Write status in adacent file.
-            if let Some(log) = &s.bfgs.log {
-                let mut path = log.folder.clone();
-                path.push(output::file::trace::STATUS);
-                let file = io::create_file(&path, "status")?;
-                serde_json::to_writer_pretty(file, &status)
-                    .with_context(|_| SerErr { what: "optimisation status" })?;
-            }
+            write_to_file(
+                &status,
+                s.bfgs.log.as_ref(),
+                output::file::trace::STATUS,
+                "status",
+                "optimisation status",
+            )?;
             Ok(status)
         })
         // There are two nested levels of "errors" here:
@@ -136,6 +150,22 @@ pub fn optimize_likelihood(
     Ok(statuses)
 }
 
+fn write_to_file(
+    content: &impl Serialize,
+    log: Option<&bfgs::Log>,
+    filename: &'static str,
+    short: &'static str,
+    what: &'static str,
+) -> Result<(), Error> {
+    if let Some(log) = log {
+        let mut path = log.folder.clone();
+        path.push(filename);
+        let file = io::create_file(&path, short)?;
+        serde_json::to_writer_pretty(file, content).with_context(|_| SerErr { what })?;
+    }
+    Ok(())
+}
+
 // Optimize from one single starting point.
 pub fn optimize_likelihood_single(
     // Receive in this format for better locality.
diff --git a/src/output/csv.rs b/src/output/csv.rs
index ef2e01228d17c978a7daf56d39211cf72e392817..932513b8768f269dc516f96628c2785ec05a646f 100644
--- a/src/output/csv.rs
+++ b/src/output/csv.rs
@@ -60,6 +60,7 @@ pub struct Record<'i> {
     //----------------------------------------------------------------------------------------------
     mutation_rate: Option<f64>,
+    ln_likelihood: Option<f64>,
 }
 
 //==================================================================================================
 
@@ -100,6 +101,7 @@ impl<'i> From<&Tree<'i>> for Record<'i> {
             local_shape,
             geometry_included,
             mutation_rate,
+            ln_likelihood,
         } = tree;
 
         // Deeper into (optional) triplet information destructuring.
@@ -164,6 +166,7 @@
             local_shape,
             geometry_included,
             mutation_rate,
+            ln_likelihood,
         }
     }
 }
diff --git a/src/output/detail.rs b/src/output/detail.rs
index b5feebdb167d5d047e870bfe82d4e31aa3d131e1..f5aa43ab4254f35707961baeb344eb3bb37d6584 100644
--- a/src/output/detail.rs
+++ b/src/output/detail.rs
@@ -14,7 +14,7 @@ use crate::{
 };
 
 /// All user-facing information used or produced by aphid
-/// regarding one particular gene tree.
+/// regarding each individual gene tree.
 #[derive(Serialize, Default)]
 pub struct Tree<'i> {
     // Borrow from species/trees identifiers.
@@ -58,6 +58,9 @@ pub struct Tree<'i> {
     /// Estimated mutation rate for this tree.
     /// (No estimate if the tree was excluded from analysis.)
     pub mutation_rate: Option<f64>,
+
+    /// Ln-likelihood of this single tree, provided it was included in the analysis.
+    pub ln_likelihood: Option<f64>,
 }
 
 #[derive(Serialize, Default)]
@@ -86,11 +89,12 @@ pub struct TripletAnalysis {
     /// assuming the reference topology was 'ABC' ~ '((A, B), C)'.
     pub topology: TripletTopology,
 
-    /// Estimate of the number of mutations along the triplet branches.
-    /// The internal branch length 'd' either represent:
-    /// - 'ab' in ((:a, :b):ab, :c) for topology 'ABC'.
-    /// - 'ac' in (:b, (:a, :c):ac) for topology 'ACB'.
-    /// - 'bc' in (:a, (:c, :b):bc) for topology 'BCA'.
+    /// Estimate of the number of mutations
+    /// along the triplet branches `[a, b, c, d]`.
+    /// The internal branch length `d` represents either:
+    /// - `ab` in `((:a, :b):ab, :c)` for topology `ABC`.
+    /// - `ac` in `(:b, (:a, :c):ac)` for topology `ACB`.
+    /// - `bc` in `(:a, (:c, :b):bc)` for topology `BCA`.
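+    /// For instance, with topology `ABC`,
+    /// the estimated lengths `[a, b, c, d]` describe the tree `((A:a, B:b):d, C:c)`.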
pub branches_lengths: TripletLengths, /// True if the topology is considered sufficiently resolved @@ -158,7 +162,7 @@ pub struct MeanLengths { pub triplet: Option<BranchLength>, /// Calculated over the designated outgroup species found - /// + the species designated as 'other'. + /// plus the species designated as 'other'. pub outgroup_other: Option<BranchLength>, }
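
Side note: with the new per-tree `ln_likelihood` field,
downstream consumers can cross-check the optimum reported in a search's `status.json`
against `detail.json`.
A minimal sketch of such a check
(hypothetical consumer code, not part of this change;
it assumes `serde_json` as a dependency
and the `output/` folder from the documentation example):

```rust
use std::{error::Error, fs};

fn main() -> Result<(), Box<dyn Error>> {
    // `detail.json` is a JSON array with one entry per gene tree.
    let text = fs::read_to_string("output/detail.json")?;
    let trees: serde_json::Value = serde_json::from_str(&text)?;
    let sum: f64 = trees
        .as_array()
        .ok_or("expected a JSON array")?
        .iter()
        // Excluded trees serialize `ln_likelihood` as `null`: skip them.
        .filter_map(|tree| tree["ln_likelihood"].as_f64())
        .sum();
    // Up to floating-point tolerance, this should match the best
    // `ln_likelihood` reported in the corresponding `status.json`.
    println!("sum of per-tree ln-likelihoods: {sum}");
    Ok(())
}
```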