From 08c4a8f173e6df546a885d504a8a12107a7ff2f7 Mon Sep 17 00:00:00 2001 From: Iago Bonnici <iago.bonnici@umontpellier.fr> Date: Wed, 16 Oct 2024 18:21:45 +0200 Subject: [PATCH] Heuristic starting points and parallel searches. --- Cargo.lock | 50 ++++ Cargo.toml | 2 + doc/src/use.md | 37 ++- src/bin/aphid/main.rs | 142 ++++++++--- src/config.rs | 22 +- src/config/defaults.rs | 46 ++-- src/config/{check.rs => expand.rs} | 378 ++++++++++++++++++++--------- src/config/raw.rs | 47 ++-- src/gene_tree/parse.rs | 8 +- src/interner.rs | 2 +- src/io.rs | 48 +++- src/learn.rs | 307 ++++++++++++----------- src/lexer.rs | 2 +- src/lib.rs | 2 +- src/model/likelihood.rs | 16 +- src/model/parameters.rs | 32 ++- src/model/scores.rs | 64 ++++- src/optim.rs | 35 ++- src/optim/bfgs.rs | 199 +++++++++------ src/optim/gd.rs | 15 +- src/optim/io.rs | 37 +++ src/optim/tensor.rs | 21 ++ src/optim/wolfe_search.rs | 143 ++++++----- src/output.rs | 42 +++- src/output/config.rs | 22 +- src/tree/ancestry.rs | 2 +- src/tree/progeny.rs | 2 +- 27 files changed, 1175 insertions(+), 548 deletions(-) rename src/config/{check.rs => expand.rs} (68%) create mode 100644 src/optim/io.rs diff --git a/Cargo.lock b/Cargo.lock index 25efee8..f503c0a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -129,10 +129,12 @@ dependencies = [ "itertools", "lexical-sort", "num-traits", + "ordinal", "paste", "pathdiff", "rand", "rand_distr", + "rayon", "regex", "rustdoc-types", "serde", @@ -366,6 +368,25 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.20" @@ -854,6 +875,15 @@ dependencies = [ "portable-atomic", ] +[[package]] +name = "ordinal" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c80c1530f46e9d8985706d7deb80b83172b250538902f607dea6cd6028851083" +dependencies = [ + "num-integer", +] + [[package]] name = "password-hash" version = "0.4.2" @@ -980,6 +1010,26 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "regex" version = "1.11.0" diff --git a/Cargo.toml b/Cargo.toml index 931ee08..80424e5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,6 +32,8 @@ color-print-proc-macro = { git = "https://gitlab.com/iago-lito/color-print", bra csv = "1.3.0" pathdiff = "0.2.1" rustdoc-types = {version = "0.30.0", optional = true } +ordinal = "0.3.2" +rayon = "1.10.0" [dev-dependencies] rand = "0.8.5" diff --git a/doc/src/use.md 
b/doc/src/use.md
index 84b6ac5..4fd5604 100644
--- a/doc/src/use.md
+++ b/doc/src/use.md
@@ -89,19 +89,40 @@ See the [topology filter](preprocess.md#topology).
 
 See the [geometry filter](preprocess.md#geometry).
 
 ### The `[init]` table
-<!-- cmdrun ./doc config::raw::Config::init -->
+
+This additional table can be set to specify
+starting point(s) for likelihood exploration.
+Provide several values to run as many independent optimisation procedures
+and explore different areas of the likelihood surface.
+Missing parameters will be assigned a default value
+based on the data at hand, according to aphid's [internal heuristic],
+for which only the `theta` value is necessary.
+Defaults to:
+<!-- cmdrun ./doc config::defaults::DEFAULT_INIT_THETAS -->
+
 For example:
 ```toml
-[init] # TODO: provide a more meaningful example?
-theta = 5e-2
-tau_1 = 6e-2
-tau_2 = 7e-2
-p_ab = 0.1
-p_ac = 0.2
+# Launch one optimisation run from exactly this point in parameter space.
+[init]
+theta = 5e-3
+tau_1 = 2e-3
+tau_2 = 5e-3
+p_ab = 0.3
+p_ac = 0.3
 p_bc = 0.3
-p_ancient_gf = 0.4
+p_ancient_gf = 0.7
+```
+
+Alternatively:
+```toml
+# Launch three independent optimisation runs.
+# Aphid's heuristic will infer values for the missing parameters.
+[init]
+theta = [5e-4, 5e-3, 5e-2] # Required by the heuristic.
+tau_2 = 2e-2               # Forced (same for the three runs).
+p_ac = [0.1, 0.2, 0.3]     # Forced (one value for each run).
 ```
 
 ### The `[search]` table
diff --git a/src/bin/aphid/main.rs b/src/bin/aphid/main.rs
index 2a41b6e..7b0dab7 100644
--- a/src/bin/aphid/main.rs
+++ b/src/bin/aphid/main.rs
@@ -1,6 +1,7 @@
 use std::{
+    cell::LazyCell,
     collections::HashSet,
-    fs::{self, File},
+    fs::{self},
     io::Write as IoWrite,
     path::{self, Path, PathBuf},
     process,
@@ -8,13 +9,15 @@ use std::{
 };
 
 use aphid::{
+    config::expand,
     extract_local_triplet, imbalance,
     interner::{Interner, ResolvedSymbol, SpeciesSymbol},
-    io::ToRelative,
+    io::{self, ToRelative},
     it_mean::SummedMean,
+    learn::heuristic_starting_point,
     ln_likelihood, optimize_likelihood,
     output::{self, detail, file},
-    Config, GeneTree, GeneTriplet, GenesForest, LocalGeneTriplet, VERSION,
+    Config, GeneTree, GeneTriplet, GenesForest, LocalGeneTriplet, Parameters, VERSION,
 };
 use clap::Parser;
 use color_print::{ceprintln, cformat, cprintln};
@@ -49,17 +52,18 @@ fn run() -> Result<(), Error> {
     let interner = &mut interner;
 
     // Canonicalize files.
-    let output = path::absolute(&args.output)?;
+    let output = path::absolute(&args.output)
+        .with_context(|_| io::AtErr { ctx: "resolving absolute path", path: &args.output })?;
 
     // Read data from the various files.
-    let (config, forest) = read_inputs(&args, &output, interner)?;
+    let (config, subfolders, forest) = read_inputs(&args, &output, interner)?;
 
     // Check overall consistency.
     // Extract the set of designated species = 'triplet' + 'outgroup' + 'other'.
     let designated_species = check_input_consistency(&forest, &config, interner)?;
 
     // Prepare output folder.
-    prepare_output(&output, &args)?;
+    prepare_output(&output, &subfolders, &args)?;
 
     // Output full configuration detail.
write_config(&output, &config, interner)?; @@ -108,17 +112,17 @@ fn read_inputs( args: &args::Args, output_folder: &Path, interner: &mut Interner, -) -> Result<(Config, GenesForest), Error> { - let path = &args.config.canonicalize()?; +) -> Result<(Config, Vec<PathBuf>, GenesForest), Error> { + let path = io::canonicalize(&args.config)?; cprintln!("Read config from <b>{}</>.", path.display()); - let config = Config::from_file(path, output_folder, interner)?; + let (config, output_subfolders) = Config::from_file(&path, output_folder, interner)?; let path = &config.trees; cprintln!("Read gene trees from <b>{}</>:", path.display()); let forest = GenesForest::from_file(path, interner)?; cprintln!(" Found <g>{}</> gene trees.", forest.len()); - Ok((config, forest)) + Ok((config, output_subfolders, forest)) } fn check_input_consistency( @@ -155,23 +159,31 @@ fn check_input_consistency( Ok(designated_species) } -fn prepare_output(output: &Path, args: &args::Args) -> Result<(), Error> { +fn prepare_output(output: &Path, subfolders: &[PathBuf], args: &args::Args) -> Result<(), Error> { println!("Prepare output folder:"); match (output.exists(), args.force) { (true, true) => { cprintln!( " <y>Replacing</> existing folder: <b>{}</>.", - output.display() + output.to_relative()?.display() ); - fs::remove_dir_all(output)?; + fs::remove_dir_all(output) + .with_context(|_| io::AtErr { ctx: "removing folder", path: output })?; } (true, false) => OutputExistsErr { path: &output }.fail()?, (false, _) => { cprintln!(" Creating empty folder: <b>{}</>.", output.display()); } }; - fs::create_dir_all(output)?; + fs::create_dir_all(output) + .with_context(|_| io::AtErr { ctx: "creating output folder", path: output })?; + + for path in subfolders { + fs::create_dir_all(path) + .with_context(|_| io::AtErr { ctx: "creating output subfolder", path })?; + } + Ok(()) } @@ -181,8 +193,7 @@ fn write_config(output: &Path, config: &Config, interner: &Interner) -> Result<( " Write full configuration to <b>{}</>.", path.to_relative()?.display() ); - let mut file = File::create(path)?; - writeln!(file, "{:#}", config.resolve(interner).json())?; + io::write_file!(&path, "config", "{:#}", config.resolve(interner).json())?; Ok(()) } @@ -421,8 +432,7 @@ fn write_detail(output: &Path, details: &[detail::Tree]) -> Result<(), Error> { "Write full trees analysis/filtering detail to <b>{}</>.", path.to_relative()?.display() ); - let mut file = File::create(path)?; - writeln!(file, "{:#}", json!(details))?; + io::write_file!(&path, "detailed results", "{:#}", json!(details))?; Ok(()) } @@ -432,13 +442,14 @@ fn write_csv(output: &Path, details: &[detail::Tree]) -> Result<(), Error> { "Summarize scalar values to <b>{}</>.", path.to_relative()?.display() ); - let file = File::create(path)?; + let file = io::create_file(&path, "csv summary")?; let mut wtr = csv::Writer::from_writer(file); for detail in details { let record: output::csv::Record = detail.into(); wtr.serialize(record).context(CsvErr)?; } - wtr.flush()?; + wtr.flush() + .with_context(|_| io::AtErr { ctx: "flushing csv summary", path })?; Ok(()) } @@ -502,22 +513,83 @@ fn display_summary(included: &[usize], details: &[detail::Tree], config: &Config cprintln!("==> <s,g>{n}</> tree{} kept for analysis.", s(n)); } +#[allow(clippy::similar_names)] // (ab, ac, bc) +fn fill_init_parms( + config: &Config, + triplets: &[GeneTriplet], +) -> Result<Vec<Parameters<f64>>, Error> { + println!("Prepare starting points."); + let mut res = Vec::new(); + for (i, search) in 
config.searches.iter().enumerate() {
+        let input = &search.init_parms;
+        // Some parameters are reliably input.
+        let theta = input.theta.expect("Input cannot contain missing theta.");
+        let gf_times = input
+            .gf_times
+            .map(|t| t.expect("Input cannot contain missing GF time."));
+
+        // Others may need to be calculated from the default.
+        let default = LazyCell::new(|| heuristic_starting_point(triplets, theta, &gf_times));
+        let tau_1 = input.tau_1.unwrap_or_else(|| default.tau_1);
+        let tau_2 = input.tau_2.unwrap_or_else(|| default.tau_2);
+        let p_ab = input.p_ab.unwrap_or_else(|| default.p_ab);
+        let p_ac = input.p_ac.unwrap_or_else(|| default.p_ac);
+        let p_bc = input.p_bc.unwrap_or_else(|| default.p_bc);
+        let p_ancient = input.p_ancient.unwrap_or_else(|| default.p_ancient);
+
+        // Late transversal checks.
+        let index = |o: Option<_>| {
+            move || if o.is_some() { format!("[{i}]") } else { String::from("[:auto]") }
+        };
+        expand::tau_values(
+            &Some(tau_1),
+            &Some(tau_2),
+            index(input.tau_1),
+            index(input.tau_2),
+        )?;
+        expand::probability_sums(
+            [&Some(p_ab), &Some(p_ac), &Some(p_bc)],
+            [&index(input.p_ab), &index(input.p_ac), &index(input.p_bc)],
+        )?;
+
+        res.push(Parameters::<f64> {
+            theta,
+            tau_1,
+            tau_2,
+            p_ab,
+            p_ac,
+            p_bc,
+            p_ancient,
+            gf_times,
+        });
+    }
+    Ok(res)
+}
+
 fn learn(
     triplets: &[GeneTriplet],
     global: &mut output::Global,
     config: &Config,
 ) -> Result<(), Error> {
-    println!("\nLearning:");
+    println!("\nLearn:");
+
+    // Spin several parallel searches.
+    let parms = fill_init_parms(config, triplets)?;
+    let results = optimize_likelihood(triplets, &parms, &config.searches)?;
 
-    let parms = &config.search.init_parms;
-    let (opt, opt_lnl) = optimize_likelihood(triplets, parms, &config.search)?;
-    cprintln!("Optimized ln-likelihood: <g>{opt_lnl}</>");
-    println!("Optimized parameters: {}\n", opt.colored());
+    println!("\nPick best result among heuristics:");
+    let best = results
+        .into_iter()
+        .filter_map(Result::ok)
+        .max_by(|a, b| a.ln_likelihood.total_cmp(&b.ln_likelihood))
+        .unwrap();
+    cprintln!("Optimized ln-likelihood: <g>{}</>", best.ln_likelihood);
+    println!("Optimized parameters: {}\n", best.parameters.colored());
 
     // Smoke test:
     // TODO: turn into actual testing by including 'official' example data.
-    let r = ln_likelihood(triplets, &opt);
-    let t = opt_lnl;
+    let r = ln_likelihood(triplets, &best.parameters);
+    let t = best.ln_likelihood;
     assert!(
         float_eq!(t, r, abs <= 1e-6),
         "Discrepancy between regular and tensor likelihood calculations?\n\
@@ -525,9 +597,8 @@ fn learn(
          tensors: {t}\n"
     );
 
-    global.estimate.ln_likelihood = opt_lnl;
-    global.estimate.parameters = opt;
-
+    global.estimate.ln_likelihood = best.ln_likelihood;
+    global.estimate.parameters = best.parameters;
     Ok(())
 }
 
@@ -537,8 +608,7 @@ fn write_global(output: &Path, global: &output::Global) -> Result<(), Error> {
         "Write global analysis results detail to <b>{}</>.",
         path.to_relative()?.display()
     );
-    let mut file = File::create(path)?;
-    writeln!(file, "{:#}", json!(global))?;
+    io::write_file!(&path, "global results", "{:#}", json!(global))?;
     Ok(())
 }
 
@@ -546,9 +616,9 @@ fn write_global(output: &Path, global: &output::Global) -> Result<(), Error> {
 #[snafu(context(suffix(Err)))]
 enum Error {
     #[snafu(transparent)]
-    Io { source: std::io::Error },
-    #[snafu(transparent)]
-    Check { source: aphid::config::check::Error },
+    Check {
+        source: aphid::config::expand::Error,
+    },
     // TODO: have color_print feature this without the need to allocate an intermediate string?
     #[snafu(display("Output folder already exists: {}.", cformat!("<b>{}</>", path.display())))]
     OutputExists { path: PathBuf },
@@ -564,4 +634,6 @@ enum Error {
     Csv { source: csv::Error },
     #[snafu(transparent)]
     AphidIo { source: aphid::io::Error },
+    #[snafu(transparent)]
+    Optim { source: aphid::optim::Error },
 }
diff --git a/src/config.rs b/src/config.rs
index 62cc641..1207781 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -11,14 +11,12 @@ use std::path::PathBuf;
 use serde::Serialize;
 
 use crate::{
-    gene_tree::MeanNbBases,
-    interner::SpeciesSymbol,
-    optim::{bfgs::Config as BfgsConfig, gd::Config as GdConfig},
+    gene_tree::MeanNbBases, interner::SpeciesSymbol, optim::bfgs::Config as BfgsConfig,
     BranchLength, Parameters,
 };
 
-pub mod check;
 pub(crate) mod defaults;
+pub mod expand;
 pub mod raw;
 
 /// The final configuration value handed out to user.
@@ -39,8 +37,8 @@ pub struct Config {
 
     pub unresolved_length: Option<MeanNbBases>,
 
-    // Parametrize the search.
-    pub search: Search,
+    // Parametrize the searches.
+    pub searches: Vec<Search>,
 }
 
 pub const MAX_UNRESOLVED_LENGTH: f64 = 0.5;
@@ -71,15 +69,11 @@ pub struct SpeciesTriplet {
 #[derive(Debug, Serialize)]
 pub struct Search {
     // Starting point in the search space.
-    pub init_parms: Parameters<f64>,
+    // Missing values will be inferred by aphid's init heuristic.
+    pub init_parms: Parameters<Option<f64>>,
 
-    // Dataset reduction factor
-    // when searching for non-degenerated starting point.
-    pub(crate) init_data_reduction_factor: f64, // Above 1.
-
-    // Gradient search configuration
-    // when learning from a dataset reduction.
-    pub(crate) init_descent: GdConfig,
+    // (starting heuristics config:
+    //  initial simple gradient descent + data reduction used to be here)
 
     // Main learning phase configuration.
     pub(crate) bfgs: BfgsConfig,
diff --git a/src/config/defaults.rs b/src/config/defaults.rs
index 7c2c0bd..2cd2b33 100644
--- a/src/config/defaults.rs
+++ b/src/config/defaults.rs
@@ -1,22 +1,24 @@
 // Decisions for anything not user-provided.
-use super::raw::RecordTrace; +use super::raw::{ListOrScalar, RecordTrace}; use crate::{ - config::raw::{ - BfgsConfig, GdConfig, InitialParameters, Search, SlopeTrackingConfig, WolfeSearchConfig, - }, + config::raw::{BfgsConfig, GdConfig, Init, Search, SlopeTrackingConfig, WolfeSearchConfig}, Filters, }; -pub(crate) fn initial_parameters() -> InitialParameters { - InitialParameters { - theta: 0.005, - tau_1: 0.002, - tau_2: 0.005, - p_ab: 0.3, - p_ac: 0.3, - p_bc: 0.3, - p_ancient_gf: 0.7, +const DEFAULT_INIT_THETAS: &[f64] = &[1e-4, 1e-3, 1e-2, 1e-1]; +pub(crate) fn init_thetas() -> ListOrScalar<f64> { + ListOrScalar::List(DEFAULT_INIT_THETAS.into()) +} +pub(crate) fn init() -> Init { + Init { + theta: init_thetas(), + tau_1: None, + tau_2: None, + p_ab: None, + p_ac: None, + p_bc: None, + p_ancient_gf: None, } } @@ -32,23 +34,7 @@ pub(crate) fn filters() -> Filters { } pub(crate) fn search() -> Search { - Search { - init_data_reduction_factor: init_data_reduction_factor(), - init_descent: init_descent(), - bfgs: BfgsConfig::default(), - } -} - -pub(crate) fn init_data_reduction_factor() -> f64 { - 2.0 -} - -pub(crate) fn init_descent() -> GdConfig { - GdConfig { - max_iter: 1_000, - learning_rate: 1e-2, - slope_tracking: Some(SlopeTrackingConfig { sample_size: 10, threshold: 1e-1, grain: 5 }), - } + Search { bfgs: BfgsConfig::default() } } impl Default for BfgsConfig { diff --git a/src/config/check.rs b/src/config/expand.rs similarity index 68% rename from src/config/check.rs rename to src/config/expand.rs index 0bf5eba..23c1263 100644 --- a/src/config/check.rs +++ b/src/config/expand.rs @@ -3,7 +3,7 @@ use std::{ cmp::Ordering, - collections::HashSet, + collections::{HashMap, HashSet}, path::{Path, PathBuf}, }; @@ -11,6 +11,7 @@ use arrayvec::ArrayVec; use regex::Regex; use snafu::{ensure, Snafu}; +use super::raw::ListOrScalar; use crate::{ config::{ defaults, @@ -21,8 +22,11 @@ use crate::{ io::{self, read_file}, model::parameters::{GeneFlowTimes, MAX_N_GF_TIMES}, optim::{ - self, bfgs::Config as BfgsConfig, gd::Config as GdConfig, - wolfe_search::Config as WolfeConfig, SlopeTrackingConfig, + self, + bfgs::{self, Config as BfgsConfig}, + gd::Config as GdConfig, + wolfe_search::Config as WolfeConfig, + SlopeTrackingConfig, }, output::file, Config, Filters, Parameters, @@ -36,11 +40,12 @@ macro_rules! err { impl Config { // Read raw config from file, then check it and convert into a valid config value. + // Also return a set of output sub-folders to create. pub fn from_file( path: &Path, output_folder: &Path, interner: &mut Interner, - ) -> Result<Self, Error> { + ) -> Result<(Self, OutputFolders), Error> { // Parse raw TOML file. let input = read_file(path)?; let raw = raw::Config::parse(&input)?; @@ -64,8 +69,8 @@ impl Config { ); } - let search = Search::try_from(&raw, output_folder)?; - let taxa = raw.taxa._try_into(interner)?; + let (search, output_folders) = Search::expand_from(&raw, output_folder)?; + let taxa = raw.taxa.expand(interner)?; // Resolve relative paths relatively to the config file. let mut trees = raw.taxa.trees; @@ -84,17 +89,20 @@ impl Config { } // Most checks implemented within `TryFrom` trait. - Ok(Config { - search, - taxa, - trees, - unresolved_length: raw.unresolved_length, - filters: if let Some(ref raw) = raw.filters { - raw.try_into()? - } else { - defaults::filters() + Ok(( + Config { + searches: search, + taxa, + trees, + unresolved_length: raw.unresolved_length, + filters: if let Some(ref raw) = raw.filters { + raw.try_into()? 
+ } else { + defaults::filters() + }, }, - }) + output_folders, + )) } // The union of all sections constitutes the set of "designated" species. @@ -107,6 +115,8 @@ impl Config { } } +type OutputFolders = Vec<PathBuf>; + //-------------------------------------------------------------------------------------------------- // Check filter parameters. impl TryFrom<&raw::Filters> for Filters { @@ -135,20 +145,28 @@ impl TryFrom<&raw::Filters> for Filters { // Check search configuration. impl Search { - fn try_from(raw: &raw::Config, output_folder: &Path) -> Result<Self, Error> { - let init_data_reduction_factor = raw.search.init_data_reduction_factor; - if init_data_reduction_factor <= 1.0 { - err!( - ("The data reduction factor to search for non-degenerated starting point \ - must be greater than 1. Received: {init_data_reduction_factor}.") - ); + fn expand_from( + raw: &raw::Config, + output_folder: &Path, + ) -> Result<(Vec<Self>, OutputFolders), Error> { + let inits = try_from_parameters(&raw.init, &raw.gf_times)?; + + // Prepare paths to traces if required. + let n = inits.len(); + let nd = n.checked_ilog10().unwrap_or(0) as usize + 1; // Number of digits for folder names. + let mut res = Vec::new(); + let mut output_folders = Vec::new(); + let base = output_folder.join(file::trace::FOLDER); + for (init_parms, i) in inits.into_iter().zip(1..) { + let id = format!("{i:0nd$}"); + let folder = base.join(&id); + let bfgs = BfgsConfig::try_from(&raw.search.bfgs, raw.gf_times.len(), id, &folder)?; + let search = Search { init_parms, bfgs }; + res.push(search); + output_folders.push(folder); } - Ok(Self { - init_parms: Parameters::try_from(&raw.init, &raw.gf_times)?, - init_data_reduction_factor, - init_descent: (&raw.search.init_descent).try_into()?, - bfgs: BfgsConfig::try_from(&raw.search.bfgs, raw.gf_times.len(), output_folder)?, - }) + + Ok((res, output_folders)) } } @@ -171,6 +189,7 @@ impl BfgsConfig { fn try_from( raw: &raw::BfgsConfig, n_gf_times: usize, + id: String, output_folder: &Path, ) -> Result<Self, Error> { let &raw::BfgsConfig { @@ -187,12 +206,13 @@ impl BfgsConfig { must be null or positive. Received: {step_size_threshold}.") ) ); - let (main_trace_path, linsearch_trace_path) = match record_trace { - RecordTrace::No => (None, None), - RecordTrace::Global => (Some(output_folder.join(file::MAIN_TRACE)), None), + let (log, main, linsearch) = match record_trace { + RecordTrace::No => (false, None, None), + RecordTrace::Global => (true, Some(output_folder.join(file::trace::GLOBAL)), None), RecordTrace::Detail => ( - Some(output_folder.join(file::MAIN_TRACE)), - Some(output_folder.join(file::LINSEARCH_TRACE)), + true, + Some(output_folder.join(file::trace::GLOBAL)), + Some(output_folder.join(file::trace::DETAIL)), ), }; Ok(Self { @@ -203,19 +223,26 @@ impl BfgsConfig { .transpose()?, wolfe: wolfe_search.try_into()?, small_step: step_size_threshold, - main_trace_path, - linsearch_trace_path, - // Unexposed: the meaning of variables is bound to the program internals. - // /!\ Order here must match order within the optimised tensors. - variable_names: Some({ - let mut names = ["theta", "tau_1", "delta_tau", "gf", "ac", "bc", "ancient"] - .iter() - .map(ToString::to_string) - .collect::<Vec<_>>(); - for i in 0..n_gf_times { - names.push(format!("gf_time_{i}")); + log: log.then(|| { + bfgs::Log { + folder: output_folder.to_path_buf(), + id, + main, + linsearch, + // Unexposed: the meaning of variables is bound to the program internals. 
+ // /!\ Order here must match order within the optimised tensors. + variable_names: Some({ + let mut names = + ["theta", "tau_1", "delta_tau", "gf", "ac", "bc", "ancient"] + .iter() + .map(ToString::to_string) + .collect::<Vec<_>>(); + for i in 0..n_gf_times { + names.push(format!("gf_time_{i}")); + } + names + }), } - names }), }) } @@ -300,87 +327,212 @@ impl TryFrom<&raw::WolfeSearchConfig> for WolfeConfig { //-------------------------------------------------------------------------------------------------- // Check initial parameters. -impl Parameters<f64> { - fn try_from( - init: &raw::InitialParameters, - raw_gft: &raw::GeneFlowTimes, - ) -> Result<Self, Error> { - use Ordering as O; - let &raw::InitialParameters { - theta, - tau_1, - tau_2, - p_ab, - p_ac, - p_bc, - p_ancient_gf, - } = init; - - macro_rules! check_positive { - ($($name:ident),+) => {$({ - let value = init.$name; - ensure!(value >= 0., - err!(( - "Model parameter '{}' must be positive, not {value}.", - stringify!($name) - )) - ) - })+}; +#[allow(clippy::too_many_lines)] // Lot of checking, difficult to disentangle. +#[allow(clippy::similar_names)] // (p_ab, p_bc, p_ac) +fn try_from_parameters( + init: &raw::Init, + raw_gft: &raw::GeneFlowTimes, +) -> Result<Vec<Parameters<Option<f64>>>, Error> { + use ListOrScalar as LS; + + // Scroll the whole list of user-set parameters from left to right, + // checking every 'vertical slice' of it as a separate set of parameters. + // The parameters not set become 'None' for all slices. + // The parameters set to scalars are replicated among all slices. + // The parameters set to arrays are checked for being all the same size. + + // Produce a values generator based on the three possible options + // (+ the number of values to be yielded for arrays) + fn to_generator<'r>( + ls: Option<&'r LS<f64>>, + ) -> (Option<usize>, Box<dyn FnMut() -> Option<f64> + 'r>) { + match ls { + None => (None, Box::new(|| None)), + Some(&LS::Scalar(value)) => (None, Box::new(move || Some(value))), + Some(LS::List(vec)) => { + let mut it = vec.iter().copied(); + (Some(vec.len()), Box::new(move || it.next())) + } + } + } + + let raw::Init { + theta, + tau_1, + tau_2, + p_ab, + p_ac, + p_bc, + p_ancient_gf, + } = init; + + // Call for each parameter, + // collecting the number of values to be yielded for each. + let mut n_values = HashMap::new(); + let mut collect = |ls, name| { + let (n, mut res) = to_generator(ls); + if let Some(n) = n { + n_values.insert(n, name); + } + move || (res(), name) + }; + + let mut theta = collect(Some(theta), "theta"); + let mut tau_1 = collect(tau_1.as_ref(), "tau_1"); + let mut tau_2 = collect(tau_2.as_ref(), "tau_2"); + let mut p_ab = collect(p_ab.as_ref(), "p_ab"); + let mut p_ac = collect(p_ac.as_ref(), "p_ac"); + let mut p_bc = collect(p_bc.as_ref(), "p_bc"); + let mut p_ancient_gf = collect(p_ancient_gf.as_ref(), "p_ancient_gf"); + + // Figure the total number of starting points. + let n_points = match n_values.len() { + 0 => 1, + 1 => n_values.into_iter().next().unwrap().0, + _ => { + let mut it = n_values.into_iter(); + let (a, a_name) = it.next().unwrap(); + let (b, b_name) = it.next().unwrap(); + let s = |n| if n > 1 { "s" } else { "" }; + return ConfigErr { + mess: format!( + "Inconsistent numbers of starting point parameters: \ + {a} value{} provided for {a_name} but {b} value{} for {b_name}.", + s(a), + s(b) + ), + } + .fail()?; } - macro_rules! check_prob { - ($($name:ident),+) => {$({ - let value = init.$name; - ensure!((0. 
..=1.).contains(&value),
+    };
+
+    // And collect this many "vertically", checking every slice for parameter consistency.
+    let mut res = Vec::new();
+
+    // Special checking for gene flow.
+    let gft: GeneFlowTimes<f64> = raw_gft.try_into()?;
+    let gft = GeneFlowTimes(gft.0.into_iter().map(Some).collect());
+
+    for i in 0..n_points {
+        let index = || format!("[{i}]");
+
+        let check_positive = |(v, name): (Option<f64>, &str)| -> Result<Option<f64>, Error> {
+            v.map(|v| {
+                ensure!(
+                    v > 0.,
                     err!((
-                        "Model parameter '{}' must be a probability (between 0 and 1), \
-                         not {value}.",
-                        stringify!($name)
+                        "Model parameter {name}{} must be positive, not {v}.",
+                        index(),
                     ))
-                )
-            })+};
-        }
+                );
+                Ok(v)
+            })
+            .transpose()
+        };
+
+        let check_probability = |(v, name)| -> Result<Option<f64>, Error> {
+            check_positive((v, name))?
+                .map(|v| {
+                    ensure!(
+                        v <= 1.,
+                        err!((
+                            "Model parameter {name}{} must be a probability \
+                             (between 0 and 1), not {v}.",
+                            index(),
+                        ))
+                    );
+                    Ok(v)
+                })
+                .transpose()
+        };
+
+        // Individual value test (context-less).
+        let p = Parameters {
+            theta: check_positive(theta())?,
+            tau_1: check_positive(tau_1())?,
+            tau_2: check_positive(tau_2())?,
+            p_ab: check_probability(p_ab())?,
+            p_ac: check_probability(p_ac())?,
+            p_bc: check_probability(p_bc())?,
+            p_ancient: check_probability(p_ancient_gf())?,
+            gf_times: gft.clone(),
+        };
 
-        check_positive!(theta, tau_1, tau_2);
-        check_prob!(p_ab, p_ac, p_bc, p_ancient_gf);
+        // Transversal checks.
+        tau_values(&p.tau_1, &p.tau_2, index, index)?;
+        probability_sums([&p.p_ab, &p.p_ac, &p.p_bc], [&index, &index, &index])?;
 
-        match tau_2.total_cmp(&tau_1) {
+        // Correct parameters slice.
+        res.push(p);
+    }
+
+    Ok(res)
+}
+
+// Abstract away for reuse when checking mixtures of user-input + default parameters.
+pub fn tau_values(
+    tau_1: &Option<f64>,
+    tau_2: &Option<f64>,
+    index_1: impl Fn() -> String,
+    index_2: impl Fn() -> String,
+) -> Result<(), Error> {
+    use Ordering as O;
+    if let (Some(tau_1), Some(tau_2)) = (tau_1, tau_2) {
+        match tau_2.total_cmp(tau_1) {
             O::Less => {
-                return err!(
-                    ("The second coalescence time must be older than the first: \
-                      maybe tau_1 ({tau_1}) and tau_2 ({tau_2}) were meant the other way round?")
-                )
+                return err!((
+                    "The second coalescence time must be older than the first: \
+                     here tau_2{i2} < tau_1{i1}: {tau_2} < {tau_1}.",
+                    i1 = index_1(),
+                    i2 = index_2(),
+                ))
                 .fail()
             }
             O::Equal => {
-                return err!(
-                    ("The two coalescence times cannot be identical: \
-                      here tau_1 == tau_2 == {tau_1}.")
-                )
+                return err!((
+                    "The two coalescence times cannot be identical: \
+                     here tau_1{i1} == tau_2{i2} == {tau_1}.",
+                    i1 = index_1(),
+                    i2 = index_2(),
+                ))
                 .fail()
             }
             O::Greater => {}
         };
+    }
+    Ok(())
+}
 
-        let s = p_ab + p_bc + p_ac;
-        ensure!(
-            s <= 1.0,
-            err!(
-                ("The sum of gene flow transfer probabilities must not be larger than 1. \
-                  p_ab + p_bc + p_ac = {p_ab} + {p_bc} + {p_ac} = {s} > 1.")
-            )
-        );
-
-        Ok(Parameters {
-            theta,
-            tau_1,
-            tau_2,
-            p_ab,
-            p_ac,
-            p_bc,
-            p_ancient: p_ancient_gf,
-            gf_times: raw_gft.try_into()?,
-        })
+// Combinations check (not scalable with the number of p_*).
+#[allow(clippy::similar_names)]
+pub fn probability_sums(
+    probs: [&Option<f64>; 3],
+    index: [&dyn Fn() -> String; 3],
+) -> Result<(), Error> {
+    let s: f64 = probs.iter().filter_map(|p| p.as_ref()).sum();
+    if s > 1.0 {
+        let names = ["p_ab", "p_ac", "p_bc"];
+        let terms = names
+            .iter()
+            .zip(index.iter())
+            .zip(probs.iter())
+            .filter(|(_, p)| p.is_some())
+            .map(|(p, _)| p)
+            .map(|(name, i)| format!("{name}{}", i()))
+            .collect::<Vec<_>>()
+            .join(" + ");
+        let vterms = probs
+            .iter()
+            .filter_map(|p| p.as_ref().map(ToString::to_string))
+            .collect::<Vec<_>>()
+            .join(" + ");
+        err!(
+            ("The sum of gene flow transfer probabilities must not be larger than 1. \
+              Here {terms} = {vterms} = {s} > 1.")
+        )
+        .fail()?;
+    }
+    Ok(())
 }
 
 //--------------------------------------------------------------------------------------------------
@@ -427,7 +579,7 @@ impl TryFrom<&raw::GeneFlowTimes> for GeneFlowTimes<f64> {
 //--------------------------------------------------------------------------------------------------
 impl raw::Taxa {
-    fn _try_into(&self, interner: &mut Interner) -> Result<Taxa, Error> {
+    fn expand(&self, interner: &mut Interner) -> Result<Taxa, Error> {
         // Check taxa consistency.
         let Self { triplet: raw_triplet, outgroup, other, .. } = self;
         let triplet: [&str; 3] = raw_triplet
diff --git a/src/config/raw.rs b/src/config/raw.rs
index e0cc109..f5b4d11 100644
--- a/src/config/raw.rs
+++ b/src/config/raw.rs
@@ -27,7 +27,7 @@ pub struct Config {
 
     /// List the relative dates for possible gene flow events.
     /// At least one event must be specified,
-    /// but no more than [`model::parameters::MAX_N_GF_TIMES`]. 
+    /// but no more than [`model::parameters::MAX_N_GF_TIMES`].
     /// Dates are relative to the divergence times of the `triplet` species.
     #[serde(default = "defaults::gf_times")]
     pub gf_times: GeneFlowTimes,
@@ -38,12 +38,12 @@ pub struct Config {
     /// to exclude discordant scenarios.
     /// In this situation,
     /// every scenario contributes to the likelihood
-    /// instead of only the ones with a concordant topology. 
+    /// instead of only the ones with a concordant topology.
     /// The possible internal branch discordance
     /// between actual and expected length
     /// is neglected because the actual length is small.
     /// For this reason, only values inferior
-    /// to [`config::MAX_UNRESOLVED_LENGTH`] are accepted. 
+    /// to [`config::MAX_UNRESOLVED_LENGTH`] are accepted.
     /// The value is given in *mutation* units, so branch length × sequence length.
     pub unresolved_length: Option<f64>,
 
@@ -51,10 +51,8 @@ pub struct Config {
     /// control whether input trees are dismissed or kept for analysis.
     pub filters: Option<Filters>,
 
-    /// This additional table can be set to specify
-    /// the parameters starting point for likelihood exploration.
-    #[serde(default = "defaults::initial_parameters")]
-    pub init: InitialParameters,
+    #[serde(default = "defaults::init")]
+    pub init: Init,
 
     /// This additional table can be set for additional control
     /// over the likelihood exploration.
@@ -171,16 +169,23 @@ impl fmt::Debug for NodeInput {
 //--------------------------------------------------------------------------------------------------
 // Statistical model parameters.
 
+type LoS = ListOrScalar<f64>;
+type OLoS = Option<ListOrScalar<f64>>;
+
 #[derive(Deserialize)]
 #[serde(deny_unknown_fields)]
-pub struct InitialParameters {
-    pub theta: f64,
-    pub tau_1: f64,
-    pub tau_2: f64,
-    pub p_ab: f64,
-    pub p_ac: f64,
-    pub p_bc: f64,
-    pub p_ancient_gf: f64,
+pub struct Init {
+    // Only the first field is necessary.
+    #[serde(default = "defaults::init_thetas")]
+    pub theta: LoS,
+
+    // Others are optional.
+    pub tau_1: OLoS,
+    pub tau_2: OLoS,
+    pub p_ab: OLoS,
+    pub p_ac: OLoS,
+    pub p_bc: OLoS,
+    pub p_ancient_gf: OLoS,
 }
 
 //--------------------------------------------------------------------------------------------------
@@ -189,10 +194,6 @@ pub struct Search {
 #[derive(Deserialize)]
 #[serde(deny_unknown_fields)]
 pub struct Search {
-    #[serde(default = "defaults::init_data_reduction_factor")]
-    pub init_data_reduction_factor: f64,
-    #[serde(default = "defaults::init_descent")]
-    pub init_descent: GdConfig,
     #[serde(default)]
     pub bfgs: BfgsConfig,
 }
@@ -246,6 +247,14 @@ pub struct SlopeTrackingConfig {
 //==================================================================================================
 // Parsing utils.
 
+// Allow either scalar or vector data input.
+#[derive(Deserialize)]
+#[serde(untagged)]
+pub enum ListOrScalar<T> {
+    Scalar(T),
+    List(Vec<T>),
+}
+
 // Lists of strings can either be specified as regular arrays like in:
 //    ["a", "b", "c", "d"]
 // Or as whitespace-separated strings, like in:
diff --git a/src/gene_tree/parse.rs b/src/gene_tree/parse.rs
index a53f467..6a227e3 100644
--- a/src/gene_tree/parse.rs
+++ b/src/gene_tree/parse.rs
@@ -22,7 +22,7 @@ struct GeneTreeDataReader<'n> {
     interner: &'n mut Interner,
 }
 
-impl<'n> GeneTreeDataReader<'n> {
+impl GeneTreeDataReader<'_> {
     fn read_name(&mut self, name: &str, i_node: usize) -> Result<SpeciesSymbol, LexerError> {
         let symbol = self.interner.get_or_intern(name);
         match self.index.entry(symbol) {
@@ -39,8 +39,8 @@
     }
 }
 
-impl<'n> TreeDataReader<'_, BranchLength, Option<SpeciesSymbol>, SpeciesSymbol>
-    for GeneTreeDataReader<'n>
+impl TreeDataReader<'_, BranchLength, Option<SpeciesSymbol>, SpeciesSymbol>
+    for GeneTreeDataReader<'_>
 {
     fn parse_branch_data(
         &mut self,
@@ -85,7 +85,7 @@ impl<'n> TreeDataReader<'_, BranchLength, Option<SpeciesSymbol>, SpeciesSymbol>
 }
 
 // Wrap basic tree reader into a gene tree reader.
-impl<'i> Lexer<'i> {
+impl Lexer<'_> {
     pub(crate) fn read_gene_tree(
         &mut self,
         interner: &mut Interner,
diff --git a/src/interner.rs b/src/interner.rs
index 739dbb3..76119b9 100644
--- a/src/interner.rs
+++ b/src/interner.rs
@@ -22,7 +22,7 @@ pub type Interner = StringInterner<BufferBackend<Symbol>>;
 // would make the output hard or impossible to read.
 pub struct ResolvedSymbol<'i>(&'i str);
 
-impl<'i> fmt::Debug for ResolvedSymbol<'i> {
+impl fmt::Debug for ResolvedSymbol<'_> {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         f.write_fmt(format_args!(":{}", self.0))
     }
diff --git a/src/io.rs b/src/io.rs
index ecb6b4b..53775d4 100644
--- a/src/io.rs
+++ b/src/io.rs
@@ -9,8 +9,11 @@ use std::{
     string::FromUtf8Error,
 };
 
+use serde::Serialize;
 use snafu::{ResultExt, Snafu};
 
+use crate::output;
+
 // Read into owned data, emitting dedicated crate errors on failure.
 pub(crate) fn read_file(path: &Path) -> Result<String, Error> {
     let tobuf = || -> PathBuf { path.into() };
@@ -27,6 +30,26 @@ pub fn canonicalize(path: &Path) -> Result<PathBuf, Error> {
         .with_context(|_| CanonicalizeErr { path })
 }
 
+pub fn create_file(path: &Path, name: &str) -> Result<File, Error> {
+    File::create(path).with_context(|_| AtErr { ctx: format!("creating {name} file"), path })
+}
+
+// Create, write and flush with harmonized error handling.
+#[macro_export]
+macro_rules! write_file {
+    ($path:expr, $name:expr, $($content:tt)+) => {{ || -> Result<(), $crate::io::Error> {
+        let (path, name) = ($path, $name);
+        let mut file = $crate::io::create_file(path, name)?;
+        write!(file, $($content)+)
+            .with_context(|_| $crate::io::AtErr { ctx: format!("writing to {name} file"), path })?;
+        file.flush().with_context(
+            |_| $crate::io::AtErr { ctx: format!("flushing {name} file"), path},
+        )?;
+        Ok(())
+    }()}};
+}
+pub use write_file;
+
 // Display canonicalized paths relative to the current directory.
 pub trait ToRelative {
     fn to_relative(&self) -> Result<Cow<Path>, Error>;
@@ -42,21 +65,38 @@ impl ToRelative for Path {
     }
 }
 
-#[derive(Debug, Snafu)]
+#[derive(Debug, Snafu, Serialize)]
 #[snafu(context(suffix(Err)))]
 pub enum Error {
     #[snafu(display("IO error while reading file {}:\n{source}", path.display()))]
-    Io { path: PathBuf, source: io::Error },
+    Io {
+        path: PathBuf,
+        #[serde(serialize_with = "output::ser_display")]
+        source: io::Error,
+    },
     #[snafu(display("UTF-8 encoding error in file {}:\n{source}", path.display()))]
     Utf8 {
         path: PathBuf,
+        #[serde(serialize_with = "output::ser_display")]
         source: FromUtf8Error,
     },
     #[snafu(display("Could not canonicalize path: {:?}:\n{source}", path.display()))]
     Canonicalize {
+        #[serde(serialize_with = "output::ser_display")]
+        source: std::io::Error,
+        path: PathBuf,
+    },
+    #[snafu(display("Could not locate current directory: {source}"))]
+    CurDir {
+        #[serde(serialize_with = "output::ser_display")]
         source: std::io::Error,
+    },
+    #[snafu(display("Error while {ctx} at {}: {source}", path.display()))]
+    #[snafu(visibility(pub))]
+    At {
+        ctx: String,
         path: PathBuf,
+        #[serde(serialize_with = "output::ser_display")]
+        source: std::io::Error,
     },
-    #[snafu(display("Could not locate current directory: {source:?}"))]
-    CurDir { source: std::io::Error },
 }
diff --git a/src/learn.rs b/src/learn.rs
index 02ec07a..8e7f279 100644
--- a/src/learn.rs
+++ b/src/learn.rs
@@ -5,41 +5,152 @@
 // and 'P' the 'parameters' to optimize,
 // and that we wish to differentiate F with respect to.
 
-use color_print::cprintln;
-use snafu::{ensure, Snafu};
+use std::{cell::LazyCell, fmt::Write};
+
+use color_print::{cformat, cprintln};
+use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
+use serde::Serialize;
+use snafu::{ResultExt, Snafu};
 use tch::{Device, Tensor};
 
 use crate::{
     config::Search,
+    gene_tree::TripletTopology,
     io::{self, ToRelative},
     model::{
-        likelihood::{data_tensors, ln_likelihood_tensors},
+        likelihood::{self, data_tensors, ln_likelihood_tensors},
         parameters::GeneFlowTimes,
         scenarios::Scenario,
-        scores::Scores,
+        scores::{self, Scores},
     },
-    optim::{Error as OptimError, OptimResult, OptimTensor},
-    GeneTriplet, Parameters,
+    optim::{self, OptimResult},
+    output, GeneTriplet, Parameters,
 };
 
+// One search success/failure status.
+pub type Status = Result<BestFound, optim::Error>;
+
 // Return both optimized inputs and output.
-#[allow(clippy::similar_names)] // p_ab/p_ac is okay.
-#[allow(clippy::too_many_lines)] // TODO: Fix once stabilized.
+#[derive(Debug, Serialize)]
+pub struct BestFound {
+    pub parameters: Parameters<f64>,
+    #[serde(serialize_with = "scores::ser")]
+    pub scores: Scores<f64>,
+    #[serde(serialize_with = "scores::ser")]
+    pub gradient: Scores<Option<f64>>,
+    pub ln_likelihood: f64,
+    pub n_evaluations: u64,
+    pub n_differentiations: u64,
+}
+
+// Aphid procedure to calculate a meaningful exploration starting point.
+#[allow(clippy::similar_names)] // (ab, ac, bc)
+pub fn heuristic_starting_point(
+    triplets: &[GeneTriplet],
+    theta: f64,
+    gf_times: &GeneFlowTimes<f64>,
+) -> Parameters<f64> {
+    use TripletTopology as T;
+    #[allow(clippy::cast_precision_loss)] // TODO: fail in case of precision loss?
+    let (n_triplets, n_gf_times) = (triplets.len() as f64, gf_times.0.len() as f64);
+
+    // Sum triplet branch lengths across the forest.
+    let mut lengths = [0; 4];
+    // Count discordant triplets.
+    let [mut n_ac, mut n_bc] = [0; 2];
+    for t in triplets {
+        for (l, sum) in t.local.branches_lengths.iter().zip(lengths.iter_mut()) {
+            *sum += l;
+        }
+        match t.local.topology {
+            T::ABC => {}
+            T::ACB => n_ac += 1,
+            T::BCA => n_bc += 1,
+        }
+    }
+    // Turn into means.
+    let mean = |s| f64::from(s) / n_triplets;
+    let [m_a, m_b, m_c, m_d] = lengths.map(mean);
+    let [f_ac, f_bc] = [n_ac, n_bc].map(mean);
+
+    // Apply formulae.
+    let tau_1 = (m_a + m_b) * theta / 4.;
+    let tau_2 = (m_c + m_d) * theta / 4. + tau_1;
+    let p_ils = likelihood::p_ils(theta, tau_1, tau_2);
+    let p_ac = f_ac - p_ils / 3.;
+    let p_bc = f_bc - p_ils / 3.;
+    let p_ab = (p_ac + p_bc) / 2.;
+    let p_ancient = 1. / n_gf_times;
+
+    Parameters {
+        theta,
+        tau_1,
+        tau_2,
+        p_ab,
+        p_ac,
+        p_bc,
+        p_ancient,
+        gf_times: gf_times.clone(),
+    }
+}
+
+// Spin several optimisation threads,
+// gathering either optimized results or errors.
 pub fn optimize_likelihood(
+    triplets: &[GeneTriplet],
+    starts: &[Parameters<f64>],
+    search: &[Search],
+) -> Result<Vec<Status>, Error> {
+    println!("Launch {} independent optimisation searches:", starts.len());
+    let statuses = starts
+        .par_iter()
+        .zip(search.par_iter())
+        .map(|(p, s)| -> Result<Status, Error> {
+            let status = optimize_likelihood_single(triplets, p, s)?;
+            // Write status to an adjacent file.
+            if let Some(log) = &s.bfgs.log {
+                let mut path = log.folder.clone();
+                path.push(output::file::trace::STATUS);
+                let file = io::create_file(&path, "status")?;
+                serde_json::to_writer_pretty(file, &status)
+                    .with_context(|_| SerErr { what: "optimisation status" })?;
+            }
+            Ok(status)
+        })
+        // There are two nested levels of "errors" here:
+        // abort early in case of e.g. i/o error to interrupt the program,
+        // but keep going in case of search errors
+        // that need to be displayed as a regular search 'status'.
+        .collect::<Result<Vec<Status>, Error>>()?;
+
+    // Still, fail if *all* statuses are 'failed'.
+    if statuses.iter().all(Result::is_err) {
+        let mut list = String::new();
+        for (start, status) in starts.iter().zip(statuses.into_iter()) {
+            let e = status.unwrap_err();
+            writeln!(list, "---\n{start:?}\n ↓\n{e}").unwrap();
+        }
+        return AllFailedErr { list }.fail();
+    }
+
+    Ok(statuses)
+}
+
+// Optimize from one single starting point.
+pub fn optimize_likelihood_single(
     // Receive in this format for better locality.
     triplets: &[GeneTriplet],
     start: &Parameters<f64>,
     search: &Search,
-) -> Result<(Parameters<f64>, f64), Error> {
+) -> Result<Status, Error> {
     // Tensors are small and control flow depends a lot on their values,
     // so roundtrips to a Gpu are not exactly interesting
     // in terms of performances.
     let device = Device::Cpu;
-    let n_triplets = triplets.len();
 
     // Extract parameters tensors 'P', tracked for the gradient.
     // Some few "fixed parameters", part of X, can be retrieved within the scores.
-    let (mut p, scores) = leaves_tensors(start, device);
+    let (p, scores) = leaves_tensors(start, device);
 
     // Extract all remaining data 'X' from the triplets.
let ngf = start.n_gf_times(); @@ -47,146 +158,55 @@ pub fn optimize_likelihood( let n_scenarios = scenarios.len(); let x = data_tensors(triplets, n_scenarios, &scenarios, device); - let mut n_eval = 0; - let mut n_diff = 0; - - // Check starting point. - let first_lnl = ln_likelihood_tensors(&x, &scores(&p)).to_double(); - if !first_lnl.is_finite() { - // When there are a lot of data, - // the starting point parameters 'P0' may lead - // to numerically non-finite likelihood values. - // If this happens, perform the optimization - // on a smaller batch of the data first until we get finite likelihood. - let sc: Scores<f64> = (&scores(&p)).into(); - cprintln!( - "<s>--</> The chosen starting point yields \ - non-finite log-likelihood (<s>{first_lnl}</>) \ - on the whole dataset (<s>{n_triplets}</> triplets):\n<k>{sc}</>\n<k>{start:?}</>", - ); - - // Numerical conversion shenanigans to divide dataset by a floating factor. - let reduce = |n: usize| { - let n: u64 = n as u64; - #[allow(clippy::cast_precision_loss)] - let n: f64 = n as f64; - let r = n / search.init_data_reduction_factor; - #[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)] - let r: usize = r as usize; - r - }; + // Learn over the whole dataset. + let f = |p: &Tensor| -ln_likelihood_tensors(&x, &scores(p)); - // Loop to search a starting point that works for all data. - 'p: loop { - let mut n_samples = reduce(triplets.len()); - let (mut sub_triplets, mut sub_x); - // Loop to search a data subset that works for the current starting point. - 'x: loop { - ensure!( - n_samples > 0, - NonFiniteLikelihoodErr { lnl: first_lnl, parms: start.clone() } - ); - cprintln!( - " Try obtaining finite log-likelihood \ - with the <s>{n_samples}</> first triplets." - ); - sub_triplets = &triplets[0..n_samples]; - sub_x = data_tensors(sub_triplets, n_scenarios, &scenarios, device); - let lnl = ln_likelihood_tensors(&sub_x, &scores(&p)).to_double(); - n_eval += 1; - if !lnl.is_finite() { - cprintln!(" Failure: obtained log-likelihood: <s>{lnl}</>."); - // Reduce the data again. - n_samples = reduce(n_samples); - continue 'x; - } - // With this working (sub_X, P), perform a learning pass - // to produce a (hopefully better) candidate P. - cprintln!( - " Success: obtained log-likelihood: <s>{lnl}</>.\n\ - <s>--</> Learn on this subsample with simple gradient descent." - ); - let f = |p: &Tensor| -ln_likelihood_tensors(&sub_x, &scores(p)); - match search.init_descent.minimize(f, &p, 0) { - Err(e) => { - cprintln!(" Learning failed:\n<k>{e}</>"); - n_samples /= 2; - continue 'x; - } - Ok(opt) => { - n_eval += opt.n_eval(); - n_diff += opt.n_diff(); - p = opt.best_vars().copy(); - let sc: Scores<f64> = (&scores(&p)).into(); - let pm: Parameters<f64> = (&sc).into(); - cprintln!( - " Success: parameters learned \ - on the subsample:\n<k>{sc}</>\n<k>{pm:?}</>" - ); - break 'x; - } - } - } - cprintln!("<s>--</> Try with this new starting point."); - let lnl = ln_likelihood_tensors(&x, &scores(&p)).to_double(); - n_eval += 1; - if !lnl.is_finite() { - cprintln!( - " Failure: obtained non-finite log-likelihood again (<s>{lnl}</>). \ - Try subsampling again from the new starting point." - ); - continue 'p; - } + let prefix = LazyCell::new(|| cformat!("<s>{}::</> ", search.bfgs.log.as_ref().unwrap().id)); + if let Some(log) = &search.bfgs.log { + if log.main.is_some() { cprintln!( - " Success: obtained finite log-likelihood \ - on the whole dataset (<s>{lnl}</>).\n\ - Start learning from this new starting point." 
+ "{}Recording BFGS search at <b>{}</>.", + *prefix, + log.folder.to_relative()?.display() ); - break 'p; } } - // Learn over the whole dataset. - let f = |p: &Tensor| -ln_likelihood_tensors(&x, &scores(p)); - - if let Some(main) = &search.bfgs.main_trace_path { - cprintln!( - " Recording main BFGS search trace at <b>{}</>.", - main.to_relative()?.display() - ); - } - if let Some(lins) = &search.bfgs.linsearch_trace_path { - cprintln!( - " Recording detailed Wolfe linear search traces at <b>{}</>.", - lins.to_relative()?.display() - ); - } // Log every step if a file was provided. - let opt = Box::new(search.bfgs.minimize(f, &p)?); - n_eval += opt.n_eval(); - n_diff += opt.n_diff(); + let opt = match search.bfgs.minimize(f, &p) { + Ok(o) => o, + Err(e) => { + // Failing to optimize is a valid status. + cprintln!("<k>{}{e}</>", *prefix); + cprintln!("{}<y>🗙</> Heuristic failed.", *prefix); + return Ok(Err(e)); + } + }; // Final value. - let p = opt.best_vars().set_requires_grad(true); - let loss = f(&p); + let vars = opt.best_vars().set_requires_grad(true); + let loss = f(&vars); loss.backward(); - let grad = p.grad(); - let scores = scores(&p); - let grad = scores.with_grads(&grad); - cprintln!( - "<s>--</> Terminate heuristic after \ - <g>{n_eval}</> evaluations and \ - <g>{n_diff}</> differentiations." - ); + let grad = vars.grad(); + let scores = scores(&vars); + let valgrads = scores.with_grads(&grad); cprintln!( - "<s>--</> Final location: <k,i>{}value 'gradient{}</>:\n{grad:#}", - '<', - '>' + "{}<g>✓</> Terminate heuristic after \ + <g>{}</> evaluations and \ + <g>{}</> differentiations.", + *prefix, + opt.n_eval(), + opt.n_diff(), ); - let parms: Parameters<f64> = (&scores).into(); - let opt_lnl = -opt.best_loss(); - Ok((parms, opt_lnl)) + Ok(Ok(BestFound { + parameters: (&scores).into(), + scores: (&scores).into(), + gradient: valgrads.only_grads(), + ln_likelihood: -opt.best_loss(), + n_evaluations: opt.n_eval(), + n_differentiations: opt.n_diff(), + })) } // Construct all gradient-tracked parameters, @@ -259,7 +279,14 @@ pub enum Error { ))] NonFiniteLikelihood { lnl: f64, parms: Parameters<f64> }, #[snafu(transparent)] - Optim { source: OptimError }, + Optim { source: optim::Error }, #[snafu(transparent)] - IO { source: io::Error }, + Io { source: io::Error }, + #[snafu(display("Error while serializing {what}: {source}."))] + Ser { + what: String, + source: serde_json::Error, + }, + #[snafu(display("All searches failed:\n{list}"))] + AllFailed { list: String }, } diff --git a/src/lexer.rs b/src/lexer.rs index bbeae18..587c0c8 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -122,7 +122,7 @@ impl<'i> Lexer<'i> { //-------------------------------------------------------------------------------------------------- #[cfg(debug_assertions)] -impl<'i> Lexer<'i> { +impl Lexer<'_> { // Debug util to see the next few chars about to be consumed by the lexer. #[allow(dead_code)] fn display_peek(&self, n: usize) { diff --git a/src/lib.rs b/src/lib.rs index 46889c8..b3d9def 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,7 +7,7 @@ pub mod it_mean; pub mod learn; mod lexer; pub mod model; -mod optim; +pub mod optim; pub mod output; mod tree; diff --git a/src/model/likelihood.rs b/src/model/likelihood.rs index 1a352cb..39cdd6f 100644 --- a/src/model/likelihood.rs +++ b/src/model/likelihood.rs @@ -66,15 +66,29 @@ impl GeneTriplet { // Expressions for probability of having ILS and of not having gene flow. // Use macro to ease abstraction over both floats and tensors. +macro_rules! 
p_ils {
+    ($theta:expr, $tau_1:expr, $tau_2:expr, $Ty:ty) => {{
+        let (theta, tau_1, tau_2) = ($theta, $tau_1, $tau_2);
+        let p_ils = ((2.0 * (tau_1 - tau_2) / theta) as $Ty).exp();
+        p_ils
+    }};
+}
 macro_rules! probabilities_ils_nongf {
     ($parameters:expr, $Ty:ty) => {{
         let Parameters { theta, tau_1, tau_2, p_ab, p_ac, p_bc, .. } = $parameters;
-        let p_ils = ((2.0 * (tau_1 - tau_2) / theta) as $Ty).exp();
+        let p_ils = p_ils!(theta, tau_1, tau_2, $Ty);
         let p_ngf = 1. - p_ab - p_ac - p_bc;
         [p_ils, p_ngf]
     }};
 }
 
+pub(crate) fn p_ils(theta: f64, tau_1: f64, tau_2: f64) -> f64 {
+    p_ils!(theta, tau_1, tau_2, f64)
+}
+pub(crate) fn p_ils_tensor(theta: Tensor, tau_1: Tensor, tau_2: Tensor) -> Tensor {
+    p_ils!(theta, tau_1, tau_2, Tensor)
+}
+
 // (not actually used in this implementation)
 impl Parameters<f64> {
     fn _probabilities_ils_nongf(&self) -> [f64; 2] {
diff --git a/src/model/parameters.rs b/src/model/parameters.rs
index d584923..21b468e 100644
--- a/src/model/parameters.rs
+++ b/src/model/parameters.rs
@@ -5,7 +5,7 @@ use std::fmt::{self, Display};
 
 use arrayvec::ArrayVec;
 use color_print::{cwrite, cwriteln};
-use serde::Serialize;
+use serde::{Deserialize, Serialize};
 use tch::Tensor;
 
 // The structure is generic among float and tensors
@@ -25,27 +25,27 @@ impl Num for ValGrad {}
 //   - tau_1 <= tau_2
 //   - p_* <= 1
 //   - p_ab + p_ac + p_bc <= 1
-#[derive(Debug, Clone, Serialize, Default)]
-pub struct Parameters<F: Num> {
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
+pub struct Parameters<F> {
     // Population size.
-    pub(crate) theta: F,
+    pub theta: F,
 
     // Divergence times.
-    pub(crate) tau_1: F,
-    pub(crate) tau_2: F,
+    pub tau_1: F,
+    pub tau_2: F,
 
     // Gene flow.
-    pub(crate) p_ab: F,
-    pub(crate) p_ac: F,
-    pub(crate) p_bc: F,
-    pub(crate) p_ancient: F, // Only the most ancient gf time gets a special probability.
+    pub p_ab: F,
+    pub p_ac: F,
+    pub p_bc: F,
+    pub p_ancient: F, // Only the most ancient gf time gets a special probability.
 
     // Date of inferred gene flow events.
-    pub(crate) gf_times: GeneFlowTimes<F>, // At least one value, sorted decreasing, all positives.
+    pub gf_times: GeneFlowTimes<F>, // At least one value, sorted decreasing, all positive.
 }
 
 // Parameters are stack-allocated, so don't overuse possible GF times.
 pub const MAX_N_GF_TIMES: usize = 3;
-#[derive(Serialize, Debug, Clone, PartialEq, Eq, Default)]
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Default)]
 #[serde(transparent)]
-pub struct GeneFlowTimes<F: Num>(pub(crate) ArrayVec<F, MAX_N_GF_TIMES>);
+pub struct GeneFlowTimes<F>(pub(crate) ArrayVec<F, MAX_N_GF_TIMES>);
 
 // Convenience bundle.
 pub(crate) struct ValGrad {
@@ -62,6 +62,12 @@ impl<F: Num> GeneFlowTimes<F> {
     }
 }
 
+impl<I> GeneFlowTimes<I> {
+    pub fn map<O>(&self, f: impl Fn(&I) -> O) -> GeneFlowTimes<O> {
+        GeneFlowTimes(self.0.iter().map(f).collect())
+    }
+}
+
 impl<F: Num> Parameters<F> {
     pub(crate) fn n_gf_times(&self) -> usize {
         self.gf_times.0.len()
diff --git a/src/model/scores.rs b/src/model/scores.rs
index fa1cfe7..b49ced0 100644
--- a/src/model/scores.rs
+++ b/src/model/scores.rs
@@ -7,6 +7,7 @@
 use std::fmt::{self, Display};
 
 use arrayvec::ArrayVec;
+use serde::{ser::SerializeStruct, Serialize, Serializer};
 use tch::{Device, Kind, Tensor};
 
 use crate::{
@@ -14,18 +15,18 @@ use crate::{
     Parameters,
 };
 
-#[derive(PartialEq, Eq)]
-pub(crate) struct Scores<F: Num> {
-    pub(crate) theta: F,     // Encodes theta.
-    pub(crate) tau_1: F,     // Encodes tau_1.
-    pub(crate) delta_tau: F, // Encodes positive difference between tau_2 and tau_1.
-    pub(crate) gf: F,        // Encodes p_ab + p_ac + p_bc.
+#[derive(Debug, PartialEq, Eq)]
+pub struct Scores<F> {
+    pub theta: F,     // Encodes theta.
+    pub tau_1: F,     // Encodes tau_1.
+    pub delta_tau: F, // Encodes positive difference between tau_2 and tau_1.
+    pub gf: F,        // Encodes p_ab + p_ac + p_bc.
     // (implicit) ab = 1, // Weights p_ab.
-    pub(crate) ac: F,      // Weights p_ac.
-    pub(crate) bc: F,      // Weights p_bc.
-    pub(crate) ancient: F, // Encodes p_ancient.
+    pub ac: F,      // Weights p_ac.
+    pub bc: F,      // Weights p_bc.
+    pub ancient: F, // Encodes p_ancient.
     // (implicit) max_gft = 1,
-    pub(crate) gf_times: GeneFlowTimes<F>, // Encode fraction of max_gft or previous greater gft.
+    pub gf_times: GeneFlowTimes<F>, // Encode fraction of max_gft or previous greater gft.
 }
 // Implicit constants.
 const SCORE_AB: f64 = 1.0;
@@ -260,6 +261,13 @@ impl Scores<Tensor> {
     }
 }
 
+impl Scores<ValGrad> {
+    pub(crate) fn only_grads(&self) -> Scores<Option<f64>> {
+        let getgrad = |vg: &ValGrad| vg.gradient;
+        into_scores!(self, getgrad)
+    }
+}
+
 // Drop gradient information.
 impl From<&Scores<ValGrad>> for Scores<f64> {
     fn from(scores: &Scores<ValGrad>) -> Self {
@@ -327,6 +335,42 @@ impl<F: Num + Display + fmt::Debug> fmt::Display for Scores<F> {
     }
 }
 
+//==================================================================================================
+// Serde.
+
+pub(crate) fn ser<S: Serializer, F: Serialize>(
+    scores: &Scores<F>,
+    ser: S,
+) -> Result<S::Ok, S::Error> {
+    let Scores {
+        theta,
+        tau_1,
+        delta_tau,
+        gf,
+        ac,
+        bc,
+        ancient,
+        gf_times,
+    } = scores;
+    let mut s = ser.serialize_struct("Scores", 8)?;
+    macro_rules! serfields {
+        ($($id:ident)+) => {$(
+            s.serialize_field(stringify!($id), $id)?;
+        )+};
+    }
+    serfields! {
+        theta
+        tau_1
+        delta_tau
+        gf
+        ac
+        bc
+        ancient
+        gf_times
+    };
+    s.end()
+}
+
 //==================================================================================================
 // Test.
diff --git a/src/optim.rs b/src/optim.rs
index 8b31ca1..63e34bc 100644
--- a/src/optim.rs
+++ b/src/optim.rs
@@ -10,6 +10,7 @@
 
 pub(crate) mod bfgs;
 pub(crate) mod gd;
+pub(crate) mod io;
 pub(crate) mod tensor;
 pub(crate) mod wolfe_search;
 
@@ -197,15 +198,28 @@ impl<'c> SlopeTracker<'c> {
 
 // Not exactly sure how to best handle error polymorphism among optimizers?
 // Here, an explicit list of implementors is required.
-#[derive(Debug, Snafu)]
+#[derive(Debug, Snafu, Serialize)]
 #[snafu(context(suffix(Err)))]
 pub enum Error {
     #[snafu(display("Configuration:\n{mess}"))]
-    Config { mess: String },
+    Config {
+        mess: String,
+    },
     #[snafu(display("Gradient descent failure:\n{source}"))]
-    Gd { source: gd::Error },
+    Gd {
+        #[serde(flatten)]
+        source: gd::Error,
+    },
     #[snafu(display("Failure of BFGS algorithm:\n{source}"))]
-    Bfgs { source: bfgs::Error },
+    Bfgs {
+        #[serde(flatten)]
+        source: bfgs::Error,
+    },
+    #[snafu(transparent)]
+    Io {
+        #[serde(flatten)]
+        source: io::Error,
+    },
 }
 
 macro_rules!
cerr { @@ -247,7 +262,7 @@ mod tests { use rand_distr::StandardNormal; use tch::{Device, Kind, Tensor}; - use super::wolfe_search::Config as WolfeSearchConfig; + use super::{bfgs::Log, wolfe_search::Config as WolfeSearchConfig}; use crate::optim::{ bfgs::Config as BfgsConfig, gd::Config as GdConfig, History, OptimResult, OptimTensor, SlopeTrackingConfig, @@ -298,9 +313,13 @@ mod tests { threshold: 1e-3, grain: 5, }), - main_trace_path: Some(PathBuf::from("./target/bfgs.csv")), - linsearch_trace_path: Some(PathBuf::from("./target/bfgs_linsearch.csv")), - variable_names: None, + log: Some(Log { + id: "test".into(), + folder: "./target".into(), + main: Some(PathBuf::from("./target/bfgs.csv")), + linsearch: Some(PathBuf::from("./target/bfgs_linsearch.csv")), + variable_names: None, + }), } } diff --git a/src/optim/bfgs.rs b/src/optim/bfgs.rs index ed71ade..6f5acc5 100644 --- a/src/optim/bfgs.rs +++ b/src/optim/bfgs.rs @@ -3,23 +3,25 @@ // Stops on its own when the mean slope of the history // becomes flat, provided the history is full. -use std::{ - fs::File, - io::{self, Write}, - path::PathBuf, -}; +use std::{cell::LazyCell, io::Write, path::PathBuf}; +use color_print::{cformat, cprintln}; use serde::Serialize; use snafu::{ensure, ResultExt, Snafu}; use tch::Tensor; -use crate::optim::{ - simple_optim_result_impl, - wolfe_search::{ - self, Config as WolfeSearchConfig, Error as WolfeSearchError, LastStep, Location, - Summary as WolfeSearchSummary, +use crate::{ + optim::{ + self, + io::{self, TraceFile}, + simple_optim_result_impl, tensor, + wolfe_search::{ + self, Config as WolfeSearchConfig, Error as WolfeSearchError, LastStep, Location, + Summary as WolfeSearchSummary, + }, + Best, Error as OptimError, OptimTensor, SlopeTracker, SlopeTrackingConfig, }, - Best, Error as OptimError, OptimTensor, SlopeTracker, SlopeTrackingConfig, + output, }; // The exposed config. @@ -34,9 +36,16 @@ pub(crate) struct Config { // In this situation, consider that the search is over // if the step norm was shorter than this threshold value. pub(crate) small_step: f64, - // Logging. - pub(crate) main_trace_path: Option<PathBuf>, - pub(crate) linsearch_trace_path: Option<PathBuf>, + // Where and whether to log search traces. + pub(crate) log: Option<Log>, +} + +#[derive(Debug, Serialize)] +pub(crate) struct Log { + pub(crate) id: String, // Useful for console output in parallel execution context. + pub(crate) folder: PathBuf, + pub(crate) main: Option<PathBuf>, + pub(crate) linsearch: Option<PathBuf>, pub(crate) variable_names: Option<Vec<String>>, } @@ -86,8 +95,8 @@ struct BfgsSearch<'c, F: Fn(&Tensor) -> Tensor> { slope_tracker: Option<SlopeTracker<'c>>, // Log files to write to. - main_file: Option<File>, // Main search steps. - lin_file: Option<File>, // Linear search on every step. + main_file: Option<TraceFile<'c>>, // Main search steps. + lin_file: Option<TraceFile<'c>>, // Linear search on every step. } impl Config { @@ -134,13 +143,15 @@ impl Config { t }); - let create_file = |path: &Option<PathBuf>| -> Result<_, Error> { - Ok(if let Some(path) = path { - Some(File::create(path).context(TraceErr { path })?) - } else { - None - }) - }; + macro_rules! trace_file { + ($file:ident) => { + self.log + .as_ref() + .map(|log| log.$file.as_ref().map(|path| TraceFile::new(path))) + .flatten() + .transpose()? + }; + } // Spin actual search steps. 
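The `trace_file!` macro just above collapses two levels of optionality (no `log` configured at all, or a `log` configured without this particular trace file) while still propagating file-creation errors. The same pattern in plain function form, a sketch using `and_then` in place of the macro's `map(..).flatten()`, with a hypothetical `Log` stand-in type:

```rust
use std::{fs::File, io, path::PathBuf};

// Hypothetical stand-in for the logging configuration.
struct Log {
    main: Option<PathBuf>,
}

/// `Option<&Log>` and `Option<PathBuf>` collapse into `Option<File>`;
/// `transpose` flips the inner `Result` out so `?` can propagate it.
fn open_main_trace(log: Option<&Log>) -> Result<Option<File>, io::Error> {
    log.and_then(|log| log.main.as_ref())
        .map(File::create)
        .transpose()
}

fn main() -> Result<(), io::Error> {
    let log = Log { main: Some(PathBuf::from("trace.csv")) };
    let _maybe_file = open_main_trace(Some(&log))?; // `None` when logging is off.
    Ok(())
}
```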
let mut search = BfgsSearch { @@ -155,8 +166,8 @@ impl Config { n_steps: 0, n_eval: 1, n_diff: 1, - main_file: create_file(&self.main_trace_path)?, - lin_file: create_file(&self.linsearch_trace_path)?, + main_file: trace_file!(main), + lin_file: trace_file!(linsearch), cf: self, slope_tracker, best, @@ -169,22 +180,28 @@ impl Config { impl<F: Fn(&Tensor) -> Tensor> BfgsSearch<'_, F> { fn run_search(&mut self) -> Result<BfgsResult, Error> { let cf = self.cf; + let prefix = LazyCell::new(|| { + if let Some(log) = &self.cf.log { + cformat!("<s>{}::(BFGS)</> ", log.id) + } else { + String::new() + } + }); if cf.max_iter == 0 { - println!("No optimisation step asked for."); + cprintln!("<k>{}No optimisation step asked for.</>", *prefix); return Ok(self.result()); } - let log = cf.main_trace_path.is_some(); - if log { - self.write_header()?; - if cf.linsearch_trace_path.is_some() { - wolfe_search::write_header( - self.lin_file.as_mut().unwrap(), - self.cf.variable_names.as_ref().ok_or(self.n_vars), - )?; - } - self.log(0.)?; // Fake initial zero step. - } + self.write_header()?; + if let Some(lin) = &mut self.lin_file { + wolfe_search::write_header( + lin, + cf.log + .as_ref() + .and_then(|log| log.variable_names.as_ref()) + .ok_or(self.n_vars), + )?; + }; loop { // Pick a search direction. @@ -195,7 +212,11 @@ impl<F: Fn(&Tensor) -> Tensor> BfgsSearch<'_, F> { Location::new(&self.fun, &self.dir, &self.vars, self.loss, &self.grad), &self.cf.wolfe, &mut self.best, - (self.lin_file.as_mut(), self.n_steps), + ( + cf.log.as_ref().map(|l| l.id.as_str()), + self.lin_file.as_mut(), + self.n_steps, + ), )?; self.n_eval += n_eval; self.n_diff += n_diff; @@ -208,9 +229,10 @@ impl<F: Fn(&Tensor) -> Tensor> BfgsSearch<'_, F> { grad_after_step, }) = res else { - println!( - "Reached null step size after {n_steps} steps: \ - {n_eval} evaluations and {n_diff} differentiations." + cprintln!( + "<k>{}Reached null step size after {n_steps} steps: \ + {n_eval} evaluations and {n_diff} differentiations.</>", + *prefix, ); break Ok(self.result()); }; @@ -218,16 +240,18 @@ impl<F: Fn(&Tensor) -> Tensor> BfgsSearch<'_, F> { // Check step norm. let norm = step.dot(&step).to_double(); if norm <= 0.0 { - println!( - "Reach silent step after {n_steps} steps: \ - {n_eval} evaluations and {n_diff} differentiations." + cprintln!( + "<k>{}Reached silent step after {n_steps} steps: \ + {n_eval} evaluations and {n_diff} differentiations.</>", + *prefix, ); break Ok(self.result()); } if norm < self.cf.small_step && self.grad == grad_after_step { - println!( - "Reach silent grad step after {n_steps} steps: \ - {n_eval} evaluations and {n_diff} differentiations." + cprintln!( + "<k>{}Reached silent grad step after {n_steps} steps: \ + {n_eval} evaluations and {n_diff} differentiations.</>", + *prefix, ); break Ok(self.result()); } @@ -240,7 +264,10 @@ impl<F: Fn(&Tensor) -> Tensor> BfgsSearch<'_, F> { // Check slope.
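The loop above stops for several distinct reasons once a Wolfe step comes back: a null step size, a "silent" step whose squared norm (`step.dot(&step)`) is zero, or a small step that left the gradient strictly unchanged. A minimal sketch of those last two tests on plain slices, with a hypothetical threshold value:

```rust
/// Mirror of the post-step checks above, on plain `f64` slices.
/// Returns a stop reason, or `None` to keep iterating.
/// Note that, like `step.dot(&step)`, `norm2` is the *squared* step norm.
fn step_stop_reason(
    step: &[f64],
    grad: &[f64],
    grad_after: &[f64],
    small_step: f64, // Threshold, compared against the squared norm.
) -> Option<&'static str> {
    let norm2: f64 = step.iter().map(|x| x * x).sum();
    if norm2 <= 0.0 {
        Some("silent step")
    } else if norm2 < small_step && grad == grad_after {
        Some("silent grad step")
    } else {
        None
    }
}

fn main() {
    let reason = step_stop_reason(&[0.0, 0.0], &[1.0, -1.0], &[1.0, -1.0], 1e-12);
    assert_eq!(reason, Some("silent step"));
}
```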
if let Some(ref mut tracker) = self.slope_tracker { if let Some(lowslope) = tracker.low_slope(self.loss, n_steps) { - println!("Weak loss slope ({lowslope:e}) on iteration {n_steps}: stopping."); + cprintln!( + "<k>{}Weak loss slope ({lowslope:e}) on iteration {n_steps}: stopping.</>", + *prefix + ); break Ok(self.result()); } } @@ -256,12 +283,10 @@ impl<F: Fn(&Tensor) -> Tensor> BfgsSearch<'_, F> { self.loss = loss_after_step; self.grad = grad_after_step; - if log { - self.log(step_size)?; - } + self.log(step_size)?; if n_steps >= cf.max_iter { - println!("Max iteration reached: {n_steps}."); + cprintln!("<k>{}Max iteration reached: {n_steps}.</>", *prefix); break Ok(self.result()); } } @@ -279,13 +304,19 @@ impl<F: Fn(&Tensor) -> Tensor> BfgsSearch<'_, F> { // Traces. fn write_header(&mut self) -> Result<(), Error> { + let Some(TraceFile { path, ref mut file }) = self.main_file else { + return Ok(()); + }; + let Some(log) = &self.cf.log else { + return Ok(()); + }; // Main headers. let mut header = ["step", "loss", "step_size"] .into_iter() .map(ToString::to_string) .collect::<Vec<_>>(); for vec in ["var", "grad", "dir"] { - if let Some(varnames) = &self.cf.variable_names { + if let Some(varnames) = &log.variable_names { for name in varnames { header.push(format!("{vec}_{name}")); } @@ -295,48 +326,62 @@ impl<F: Fn(&Tensor) -> Tensor> BfgsSearch<'_, F> { } } } - let file = self.main_file.as_mut().unwrap(); - let path = self.cf.main_trace_path.as_ref().unwrap(); - writeln!(file, "{}", header.join(",")).with_context(|_| TraceErr { path })?; + writeln!(file, "{}", header.join(",")) + .with_context(|_| io::AtErr { ctx: "writing header line", path })?; Ok(()) } fn log(&mut self, step_size: f64) -> Result<(), Error> { - macro_rules! w { - ($fmt:literal $(, $val:expr)?) => {{ - let file = self.main_file.as_mut().unwrap(); - let path = self.cf.main_trace_path.as_ref().unwrap(); - write!(file, $fmt $(, $val)?).with_context(|_| TraceErr{path}) - }}; - } - w!("{}", self.n_steps)?; - w!(",{}", self.loss)?; - w!(",{}", step_size)?; - for t in [&self.vars, &self.grad, &self.dir] { - for f in t.iter::<f64>().unwrap() { - w!(",{f}")?; + let Some(TraceFile { path, ref mut file }) = self.main_file else { + return Ok(()); + }; + let mut write = || -> Result<(), std::io::Error> { + write!(file, "{}", self.n_steps)?; + write!(file, ",{}", self.loss)?; + write!(file, ",{step_size}")?; + for t in [&self.vars, &self.grad, &self.dir] { + for f in t.iter::<f64>().unwrap() { + write!(file, ",{f}")?; + } } - } - w!("\n") + writeln!(file) + }; + Ok(write().with_context(|_| io::AtErr { ctx: "writing trace file", path })?) 
} } -#[derive(Debug, Snafu)] +#[derive(Debug, Snafu, Serialize)] #[snafu(context(suffix(Err)))] pub enum Error { - #[snafu(display("Obtained non-finite loss ({loss}) with these variables:\n{vars:?}"))] - NonFiniteLoss { loss: f64, vars: Tensor }, + #[snafu(display( + "Obtained non-finite loss ({loss}) \ + before first step with these variables:\n{vars:?}" + ))] + NonFiniteLoss { + loss: f64, + #[serde(serialize_with = "tensor::ser")] + vars: Tensor, + }, #[snafu(display( "Obtained non-finite gradient for loss ({loss}):\ngradient: {grad:?}\nvariables: {vars:?}" ))] NonFiniteGrad { + #[serde(serialize_with = "tensor::ser")] grad: Tensor, + #[serde(serialize_with = "tensor::ser")] vars: Tensor, loss: f64, }, #[snafu(transparent)] - WolfeSearch { source: WolfeSearchError }, - #[snafu(display("Error writing to trace file {:?}:\n{source}", path.display()))] - Trace { path: PathBuf, source: io::Error }, + WolfeSearch { + #[serde(flatten)] + source: WolfeSearchError, + }, + #[snafu(transparent)] + Io { + #[serde(serialize_with = "output::ser_display")] + #[serde(flatten)] + source: optim::io::Error, + }, } diff --git a/src/optim/gd.rs b/src/optim/gd.rs index 6f17121..15fa85e 100644 --- a/src/optim/gd.rs +++ b/src/optim/gd.rs @@ -10,8 +10,8 @@ use snafu::Snafu; use tch::Tensor; use crate::optim::{ - simple_optim_result_impl, Best, Error as OptimError, Loggable, OptimTensor, SlopeTracker, - SlopeTrackingConfig, + simple_optim_result_impl, tensor, Best, Error as OptimError, Loggable, OptimTensor, + SlopeTracker, SlopeTrackingConfig, }; #[derive(Debug, Serialize)] @@ -153,20 +153,27 @@ impl Config { } } -#[derive(Debug, Snafu)] +#[derive(Debug, Serialize, Snafu)] #[snafu(context(suffix(Err)))] pub enum Error { #[snafu(display( "Obtained non-finite loss ({loss}) \ with these variables on step {step}:\n{vars:?}" ))] - NonFiniteLoss { loss: f64, vars: Tensor, step: u64 }, + NonFiniteLoss { + loss: f64, + #[serde(serialize_with = "tensor::ser")] + vars: Tensor, + step: u64, + }, #[snafu(display( "Obtained non-finite gradient for loss ({loss}) on step {step}:\ \ngradient: {grad:?}\nvariables: {vars:?}" ))] NonFiniteGrad { + #[serde(serialize_with = "tensor::ser")] grad: Tensor, + #[serde(serialize_with = "tensor::ser")] vars: Tensor, loss: f64, step: u64, diff --git a/src/optim/io.rs b/src/optim/io.rs new file mode 100644 index 0000000..742477c --- /dev/null +++ b/src/optim/io.rs @@ -0,0 +1,37 @@ +//! Anything io-related, but contained within this autonomous 'optim' module. +use std::{ + fs::File, + path::{Path, PathBuf}, +}; + +use serde::Serialize; +use snafu::{ResultExt, Snafu}; + +/// Pack open file and its path together, +/// useful for trace files. +pub(crate) struct TraceFile<'p> { + pub(crate) path: &'p Path, + pub(crate) file: File, +} + +impl<'p> TraceFile<'p> { + pub(crate) fn new(path: &'p Path) -> Result<Self, Error> { + Ok(Self { + file: File::create(path).with_context(|_| AtErr { ctx: "creating file", path })?, + path, + }) + } +} + +#[derive(Debug, Snafu, Serialize)] +#[snafu(context(suffix(Err)))] +pub enum Error { + #[snafu(display("Error while {ctx} at {}: {source}", path.display()))] + #[snafu(visibility(pub(crate)))] + At { + ctx: String, + path: PathBuf, + #[serde(serialize_with = "crate::output::ser_display")] + source: std::io::Error, + }, +} diff --git a/src/optim/tensor.rs b/src/optim/tensor.rs index 3da983d..918828e 100644 --- a/src/optim/tensor.rs +++ b/src/optim/tensor.rs @@ -1,5 +1,6 @@ // Convenience trait for tensors, as used in this optimisation module. 
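`tch::Tensor` does not implement `serde::Serialize`, which is why the error variants above route their tensor fields through `#[serde(serialize_with = "tensor::ser")]`, whose implementation follows below. The mechanism reduced to a self-contained sketch, with a hypothetical `Raw` stand-in type and `serde_json` used only for the demonstration:

```rust
use serde::{ser::SerializeSeq, Serialize, Serializer};

// Hypothetical stand-in for a foreign type lacking a `Serialize` impl.
struct Raw(Vec<f64>);

// Free function matching serde's `serialize_with` calling convention:
// it receives a reference to the field and the active serializer.
fn ser_raw<S: Serializer>(t: &Raw, s: S) -> Result<S::Ok, S::Error> {
    let mut seq = s.serialize_seq(Some(t.0.len()))?;
    for e in &t.0 {
        seq.serialize_element(e)?;
    }
    seq.end()
}

#[derive(Serialize)]
struct Record {
    loss: f64,
    #[serde(serialize_with = "ser_raw")]
    vars: Raw,
}

fn main() {
    let r = Record { loss: 0.5, vars: Raw(vec![1.0, 2.0]) };
    println!("{}", serde_json::to_string(&r).unwrap());
    // {"loss":0.5,"vars":[1.0,2.0]}
}
```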
+use serde::{ser::SerializeSeq, Serializer}; use tch::Tensor; //-------------------------------------------------------------------------------------------------- @@ -78,3 +79,23 @@ impl Loggable for Tensor { res } } + +//-------------------------------------------------------------------------------------------------- +// Serde. + +pub(crate) fn ser<S>(t: &Tensor, s: S) -> Result<S::Ok, S::Error> +where + S: Serializer, +{ + assert!( + t.size().len() == 1, + "Only 1-dimensional tensors can be serialized yet." + ); + let mut seq = s.serialize_seq(Some( + usize::try_from(t.size()[0]).expect("Serializing negative tensor size?"), + ))?; + for ref e in t.iter::<f64>().unwrap() { + seq.serialize_element(e)?; + } + seq.end() +} diff --git a/src/optim/wolfe_search.rs b/src/optim/wolfe_search.rs index 104a50c..454ffc5 100644 --- a/src/optim/wolfe_search.rs +++ b/src/optim/wolfe_search.rs @@ -22,17 +22,18 @@ // If the bracketing phase requires increasing alpha again, // we perform a binary search to find a < a_next < h. -use std::{ - fs::File, - io::{self, Write}, - ops::ControlFlow, -}; +use std::{cell::LazyCell, io::Write, ops::ControlFlow}; +use color_print::{cformat, cprintln}; use serde::Serialize; -use snafu::{ensure, Snafu}; +use snafu::{ensure, ResultExt, Snafu}; use tch::Tensor; -use crate::optim::{tensor::OptimTensor, Best}; +use crate::optim::{ + io::{self, TraceFile}, + tensor::{self, OptimTensor}, + Best, +}; #[derive(Debug, Serialize)] pub(crate) struct Config { @@ -63,7 +64,7 @@ pub(crate) struct Config { pub(crate) bisection_threshold: f64, // (0 ⩽ · ⩽ 1/2) } -pub(crate) struct Search<'c, 's, F: Fn(&Tensor) -> Tensor> { +pub(crate) struct Search<'s, 'c: 's, F: Fn(&Tensor) -> Tensor> { cf: &'c Config, // Borrow situation from the calling search step. @@ -76,9 +77,9 @@ pub(crate) struct Search<'c, 's, F: Fn(&Tensor) -> Tensor> { n_eval: u64, n_diff: u64, - // Log search trace, given a search identifier as first column. - trace: Option<&'s mut File>, - id: u64, + // Log search trace, given a search step number as first column. + trace: Option<&'s mut TraceFile<'c>>, + step_id: u64, n_vars: usize, } @@ -146,7 +147,7 @@ enum BinaryStepDirection { Down, } -impl<'c, 'binsearch, F: Fn(&Tensor) -> Tensor> Search<'c, 'binsearch, F> { +impl<F: Fn(&Tensor) -> Tensor> Search<'_, '_, F> { // Specify Wolfe criteria, assuming loss is finite. // Armijo's rule: check that the current step candidate @@ -463,43 +464,45 @@ impl<'c, 'binsearch, F: Fn(&Tensor) -> Tensor> Search<'c, 'binsearch, F> { x: &Tensor, // short for xs = x + alpha * p grad: Option<&Tensor>, // = ∇f(xs) ) -> Result<(), Error> { - let Some(file) = self.trace.as_mut() else { + let Some(&mut TraceFile { path, ref mut file }) = self.trace else { return Ok(()); }; - macro_rules! w { - ($fmt:literal, $value:expr) => { - write!(file, $fmt, $value)?; - }; - ($fmt:literal if $opt:expr; $alt:literal) => { - if let Some(value) = $opt { - write!(file, $fmt, value)?; + let mut write = || -> Result<(), std::io::Error> { + macro_rules! 
w { + ($fmt:literal, $value:expr) => { + write!(file, $fmt, $value)?; + }; + ($fmt:literal if $opt:expr; $alt:literal) => { + if let Some(value) = $opt { + write!(file, $fmt, value)?; + } else { + write!(file, $alt)?; + } + }; + } + w!("{}", self.step_id); + w!(",{}", alpha); + w!(",{}", phi); + w!(",{}" if phigs; ","); + for t in [Some(x), grad] { + if let Some(t) = t { + for f in t.iter::<f64>().unwrap() { + w!(",{}", f); + } } else { - write!(file, $alt)?; - } - }; - } - w!("{}", self.id); - w!(",{}", alpha); - w!(",{}", phi); - w!(",{}" if phigs; ","); - for t in [Some(x), grad] { - if let Some(t) = t { - for f in t.iter::<f64>().unwrap() { - w!(",{}", f); - } - } else { - for _ in 0..self.n_vars { - write!(file, ",")?; + for _ in 0..self.n_vars { + write!(file, ",")?; + } } } - } - writeln!(file)?; - Ok(()) + writeln!(file) + }; + Ok(write().with_context(|_| io::AtErr { ctx: "writing to trace file", path })?) } } pub(crate) fn write_header( - file: &mut File, + trace: &mut TraceFile, varnames: Result<&Vec<String>, usize>, // Get at least number if there are no names. ) -> Result<(), Error> { let mut header = ["id", "alpha", "loss", "phigrad"] @@ -520,7 +523,8 @@ pub(crate) fn write_header( } } } - writeln!(file, "{}", header.join(","))?; + writeln!(trace.file, "{}", header.join(",")) + .with_context(|_| io::AtErr { ctx: "writing trace header", path: trace.path })?; Ok(()) } @@ -530,11 +534,11 @@ pub(crate) fn write_header( // and the number of evaluations/differentiations used to find the result. // Don't return values at the point stepped to // if the best step size found is null. -pub(crate) fn search<'s, F: Fn(&Tensor) -> Tensor>( +pub(crate) fn search<'s, 'c: 's, F: Fn(&Tensor) -> Tensor>( loc: Location<'s, F>, - cf: &Config, + cf: &'c Config, best: &'s mut Best, - (trace, id): (Option<&'s mut File>, u64), + (run_id, trace, step_id): (Option<&'s str>, Option<&'s mut TraceFile<'c>>, u64), ) -> Result<Summary, Error> { use Error as E; // Initialize. @@ -547,20 +551,30 @@ pub(crate) fn search<'s, F: Fn(&Tensor) -> Tensor>( n_diff: 0, // Not yet calculated. trace, - id, + step_id, }; + let prefix = LazyCell::new(|| { + if let Some(id) = run_id { + cformat!("<s>{id}::(Wolfe)</> ") + } else { + String::new() + } + }); match search.run_search() { Err(E::DeadBinarySearch { alpha, summary } | E::DeadZoom { alpha, summary }) => { if alpha <= 0. { - println!("Null step size reached: {alpha}."); + cprintln!("<k>{}Null step size reached: {alpha}.</>", *prefix); Ok(summary) } else { - println!("(step {id}) Wolfe range too small for ulps around {alpha}?"); + cprintln!( + "<k>{}(step {step_id}) Wolfe range too small for ulps around {alpha}?</>", + *prefix + ); Ok(summary) } } Err(e @ E::FlatZoom { .. }) => { - println!("{e}"); + cprintln!("<k>{}{e}</>", *prefix); let E::FlatZoom { summary, .. 
} = e else { unreachable!() }; @@ -570,42 +584,61 @@ pub(crate) fn search<'s, F: Fn(&Tensor) -> Tensor>( } } -#[derive(Debug)] +#[derive(Debug, Serialize)] pub struct Summary { pub(crate) last_step: Option<LastStep>, pub(crate) n_eval: u64, pub(crate) n_diff: u64, } -#[derive(Debug)] +#[derive(Debug, Serialize)] pub(crate) struct LastStep { pub(crate) step_size: f64, + #[serde(serialize_with = "tensor::ser")] pub(crate) step: Tensor, + #[serde(serialize_with = "tensor::ser")] pub(crate) vars_after_step: Tensor, pub(crate) loss_after_step: f64, + #[serde(serialize_with = "tensor::ser")] pub(crate) grad_after_step: Tensor, } -#[derive(Debug, Snafu)] +#[derive(Debug, Snafu, Serialize)] #[snafu(context(suffix(Err)))] pub enum Error { #[snafu(display( "Could not find a step size yielding a finite loss value, \ starting from point:\n{x}\nin direction:\n{p}" ))] - NoStepSizeYieldingFiniteLoss { x: Tensor, p: Tensor }, + NoStepSizeYieldingFiniteLoss { + #[serde(serialize_with = "tensor::ser")] + x: Tensor, + #[serde(serialize_with = "tensor::ser")] + p: Tensor, + }, #[snafu(display( "Could not find a step size meeting Wolfe lower bound, \ starting from point:\n{x}\nin direction:\n{p}" ))] - NoStepLargeEnough { x: Tensor, p: Tensor }, + NoStepLargeEnough { + #[serde(serialize_with = "tensor::ser")] + x: Tensor, + #[serde(serialize_with = "tensor::ser")] + p: Tensor, + }, #[snafu(display("Obtained non-finite loss ({loss}) with these variables:\n{vars:?}"))] - NonFiniteLoss { loss: f64, vars: Tensor }, + NonFiniteLoss { + loss: f64, + #[serde(serialize_with = "tensor::ser")] + vars: Tensor, + }, #[snafu(display( "Obtained non-finite gradient for loss ({loss}):\ngradient: {grad:?}\nvariables: {vars:?}" ))] NonFiniteGrad { + #[serde(serialize_with = "tensor::ser")] grad: Tensor, + #[serde(serialize_with = "tensor::ser")] vars: Tensor, loss: f64, }, @@ -628,5 +661,5 @@ pub enum Error { summary: Summary, }, #[snafu(transparent)] - Trace { source: io::Error }, + Io { source: io::Error }, } diff --git a/src/output.rs b/src/output.rs index 7b7973f..6d291b0 100644 --- a/src/output.rs +++ b/src/output.rs @@ -5,14 +5,50 @@ pub mod csv; pub mod detail; pub mod global; +use std::fmt::Display; + +pub use global::Global; +use serde::Serializer; + // Standard output filenames. pub mod file { + /// The whole configuration used. pub const CONFIG: &str = "config.json"; + + /// Forest-level analysis results. pub const GLOBAL: &str = "global.json"; + + /// Tree-level analysis results. pub const DETAIL: &str = "detail.json"; + + /// Tree-level results in synthetic tabular form. pub const CSV: &str = "trees.csv"; - pub const MAIN_TRACE: &str = "search_global.csv"; - pub const LINSEARCH_TRACE: &str = "search_detail.csv"; + + /// Search traces, one per starting point. + pub mod trace { + /// Traces folder name. + // TODO: optionally (default) fuse with filenames in case there is only 1. + pub const FOLDER: &str = "search"; + + /// Starting point for this trace. + pub const INIT: &str = "init.json"; + + /// BFGS-level trace. + pub const GLOBAL: &str = "global.csv"; + + /// Wolfe search trace. + pub const DETAIL: &str = "detail.csv"; + + /// Search result. + pub const STATUS: &str = "status.json"; + } } -pub use global::Global; +// Serialize anything as its 'display' feature. 
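Types like `std::io::Error` implement `Display` but not `Serialize`; rendering them through their display string keeps error reports JSON-friendly without wrapping every foreign type. A sketch of how the helper defined just below is meant to be used, with a hypothetical `Report` struct and `serde_json` for the demonstration only:

```rust
use std::fmt::Display;

use serde::{Serialize, Serializer};

/// Same shape as the helper below: serialize any `Display` type as a string.
fn ser_display<S: Serializer, D: Display>(d: D, ser: S) -> Result<S::Ok, S::Error> {
    ser.serialize_str(&d.to_string())
}

#[derive(Serialize)]
struct Report {
    ctx: &'static str,
    #[serde(serialize_with = "ser_display")]
    source: std::io::Error,
}

fn main() {
    let report = Report {
        ctx: "creating file",
        source: std::io::Error::from(std::io::ErrorKind::PermissionDenied),
    };
    println!("{}", serde_json::to_string(&report).unwrap());
    // {"ctx":"creating file","source":"permission denied"}
}
```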
+pub(crate) fn ser_display<S, D>(d: D, ser: S) -> Result<S::Ok, S::Error> +where + S: Serializer, + D: Display, +{ + ser.serialize_str(&d.to_string()) +} diff --git a/src/output/config.rs b/src/output/config.rs index 83b9b13..4eed68e 100644 --- a/src/output/config.rs +++ b/src/output/config.rs @@ -56,7 +56,13 @@ impl SpeciesTriplet { impl Resolver<'_> { pub fn json(&self) -> JsValue { let Resolver { config, interner } = self; - let Config { trees, taxa, filters, unresolved_length, search } = config; + let Config { + trees, + taxa, + filters, + unresolved_length, + searches: search, + } = config; json!({ "trees": trees, "taxa": taxa.resolve(interner).json(), @@ -105,9 +111,15 @@ impl TripletResolver<'_> { //================================================================================================== // Debug displays (same logic without json for informal use within the program). -impl<'i> fmt::Debug for Resolver<'i> { +impl fmt::Debug for Resolver<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let Config { trees, taxa, filters, unresolved_length, search } = self.config; + let Config { + trees, + taxa, + filters, + unresolved_length, + searches: search, + } = self.config; f.debug_struct("Config") .field("trees", trees) .field("taxa", &taxa.resolve(self.interner)) @@ -118,7 +130,7 @@ impl<'i> fmt::Debug for Resolver<'i> { } } -impl<'i> fmt::Debug for TaxaResolver<'i> { +impl fmt::Debug for TaxaResolver<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let Taxa { triplet, outgroup, other } = self.taxa; let resolve = |slice: &[_]| { @@ -135,7 +147,7 @@ impl<'i> fmt::Debug for TaxaResolver<'i> { } } -impl<'i> fmt::Debug for TripletResolver<'i> { +impl fmt::Debug for TripletResolver<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let [a, b, c] = self .triplet diff --git a/src/tree/ancestry.rs b/src/tree/ancestry.rs index 14f475b..670f638 100644 --- a/src/tree/ancestry.rs +++ b/src/tree/ancestry.rs @@ -23,7 +23,7 @@ pub(crate) struct ToRootFrom<'t, B, IN, TN> { current: usize, } -impl<'t, B, IN, TN> Iterator for ToRootFrom<'t, B, IN, TN> { +impl<B, IN, TN> Iterator for ToRootFrom<'_, B, IN, TN> { type Item = usize; fn next(&mut self) -> Option<Self::Item> { if let Some(tree) = self.tree { diff --git a/src/tree/progeny.rs b/src/tree/progeny.rs index 063fe70..4c3e01b 100644 --- a/src/tree/progeny.rs +++ b/src/tree/progeny.rs @@ -43,7 +43,7 @@ impl<'n, B, IN, TN> Children<'n, B, IN, TN> { } } -impl<'n, B, IN, TN> Iterator for Children<'n, B, IN, TN> { +impl<B, IN, TN> Iterator for Children<'_, B, IN, TN> { type Item = usize; fn next(&mut self) -> Option<Self::Item> { use Node as N; -- GitLab