From 05b5e5f4a2adfb38beb6d56d4898987f9e634cd8 Mon Sep 17 00:00:00 2001
From: Iago Bonnici <iago.bonnici@umontpellier.fr>
Date: Mon, 17 Feb 2025 16:49:39 +0100
Subject: [PATCH] Merge `clear` & `unclear` into a single table.

---
 src/bin/dmox.rs | 14 +++-----
 src/config.rs   |  3 +-
 src/lib.rs      | 89 ++++++++++++++++++++++---------------------------
 3 files changed, 45 insertions(+), 61 deletions(-)

diff --git a/src/bin/dmox.rs b/src/bin/dmox.rs
index e1315f9..7480492 100644
--- a/src/bin/dmox.rs
+++ b/src/bin/dmox.rs
@@ -104,13 +104,9 @@ struct Cli {
     #[arg(long, default_value_t = 6)]
     zlevel: u32,
 
-    /// Where to write the file summarizing assigned samples.
+    /// Where to write the file summarizing assigned / unassigned barcodes.
     #[arg(long)]
-    clear: PathBuf,
-
-    /// Where to write the file summarizing unassigned samples.
-    #[arg(long)]
-    unclear: PathBuf,
+    barcodes_table: PathBuf,
 
     /// Total number of reference barcode modules for each letter.
     #[arg(long, default_value_t = 96)]
@@ -181,8 +177,7 @@ fn run() -> Result<(), Error> {
         qx,
         id_tail,
         zlevel,
-        clear,
-        unclear,
+        barcodes_table,
         n_modules,
         module_size,
         n_writers,
@@ -232,8 +227,7 @@ fn run() -> Result<(), Error> {
         distance,
         max_distance,
         schema,
-        clear,
-        unclear,
+        barcodes_table_file: barcodes_table,
         sample: Box::new(move |id, infix| {
             let mut path = PathBuf::from(&samples);
             path.push(format!("{id}.{infix}.fq.gz"));
diff --git a/src/config.rs b/src/config.rs
index b46b476..9a9e0ea 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -15,8 +15,7 @@ pub struct Config {
     pub distance: Distance,
     pub max_distance: Option<usize>,
     pub schema: PathBuf,
-    pub clear: PathBuf,
-    pub unclear: PathBuf,
+    pub barcodes_table_file: PathBuf,
     pub sample: Box<SamplePath>,
     pub output_bx: bool,
     pub output_rx: bool,
diff --git a/src/lib.rs b/src/lib.rs
index 544c670..a4e359b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -129,21 +129,19 @@ pub fn demultiplex(cf: &Config) -> Result<(), Error> {
     let samples = Samples::parse(reader)?;
 
     //----------------------------------------------------------------------------------------------
-    println!("Open log streams.");
-    // (do it now to error early if they can't be opened, but actual writing there happens later)
-    let [mut clear_file, mut unclear_file] = [
-        (&cf.clear, "Barcode \t Correct reads \t Corrected reads\n"),
-        (&cf.unclear, "Barcode \t Reads\n"),
-    ]
-    .try_map(|(path, header)| -> Result<_, Error> {
+    println!("Open table stream.");
+    // (do it now to error early if it can't be opened, but actual writing there happens later)
+    let path = &cf.barcodes_table_file;
+    let header = "Barcode\tTotal\tCorrect_Reads\tCorrected_Reads\n";
+    let table_file = {
         println!("  {}", path.display().to_string().blue());
         let mut writer = ContextualizedWriter::from_file(path)?;
         writer
             .write_all(header.as_bytes())
             .with_context(|_| WriteLineErr { context: &writer })?;
         writer.bump_line();
-        Ok(writer)
-    })?;
+        writer
+    };
     let mut counters = IndexMap::new();
 
     //----------------------------------------------------------------------------------------------
@@ -324,12 +322,11 @@ pub fn demultiplex(cf: &Config) -> Result<(), Error> {
         }
 
         //------------------------------------------------------------------------------------------
-        // Find the corresponding matches to construct the barcode.
+        // Find the corresponding matches against each reference to construct the barcode.
         let mut bx = [0, 0, 0, 0]; // [{A, C}, {B, D}] <> [{I1}, {I2}]
-        let mut is_exact = true;
-        let mut is_corrected = false;
-        let mut is_unclear = false;
-        let mut sample_code = None; // Fill if that one is clear.
+        let mut all_clear = true; // Lowered as soon as one closest match is missing or too far.
+        let mut all_exact = true; // Lowered as soon as one match distance is non-null.
+        let mut sample_code = None; // Fill if the sample letter has a unique closest match.
         let (m1, m2) = bx.split_at_mut(2);
         for ((i, _), (u, v), matches) in [
             (&i1, ((b'A', &mut ref_a), (b'C', &mut ref_c)), m1),
@@ -341,18 +338,17 @@ pub fn demultiplex(cf: &Config) -> Result<(), Error> {
             let q2 = &q2[1..]; // Remove (and ignore) stitch base.
             for ((q, (letter, reference)), m) in [(q2, u), (q1, v)].iter_mut().zip(matches) {
                 let Some((code, distance)) = reference.closest(q) else {
-                    is_unclear = true;
+                    all_clear = false;
                     continue;
                 };
                 if let Some(max) = cf.max_distance {
                     if distance > max {
-                        is_unclear = true;
+                        all_clear = false;
                         continue;
                     }
                 }
                 if distance > 0 {
-                    is_corrected = true;
-                    is_exact = false;
+                    all_exact = false;
                 }
                 *m = code + 1;
                 if *letter == samples.letter {
@@ -361,20 +357,23 @@ pub fn demultiplex(cf: &Config) -> Result<(), Error> {
             }
         }
 
-        // Update counters.
+        // Update counters,
+        // considering unclear samples as neither 'exact' nor 'corrected'.
+        let [exact, corrected] =
+            [all_exact, !all_exact].map(|b| if all_clear { u64::from(b) } else { 0 });
         match counters.entry(bx) {
             Entry::Vacant(entry) => {
                 entry.insert(Counters {
-                    exact: u64::from(is_exact),
-                    corrected: u64::from(is_corrected),
-                    unclear: u64::from(is_unclear),
+                    total: 1,
+                    exact,
+                    corrected,
                 });
             }
             Entry::Occupied(mut entry) => {
                 let counters = entry.get_mut();
-                counters.exact += u64::from(is_exact);
-                counters.corrected += u64::from(is_corrected);
-                counters.unclear += u64::from(is_unclear);
+                counters.total += 1;
+                counters.exact += exact;
+                counters.corrected += corrected;
             }
         }
 
@@ -553,33 +552,20 @@ pub fn demultiplex(cf: &Config) -> Result<(), Error> {
 
     //----------------------------------------------------------------------------------------------
     println!("Writing clear/unclear files.");
+    let mut writer = table_file;
     for (code, counters) in counters {
         let Counters {
+            total,
             exact,
             corrected,
-            unclear,
         } = counters;
-        if unclear > 0 {
-            |w: &mut ContextualizedWriter<_>| -> Result<(), _> {
-                write_code(w, &code)?;
-                writeln!(w, "\t{unclear}")?;
-                w.bump_line();
-                Ok(())
-            }(&mut unclear_file)
-            .with_context(|_| WriteLineErr {
-                context: &unclear_file,
-            })?;
-        } else {
-            |w: &mut ContextualizedWriter<_>| -> Result<(), _> {
-                write_code(w, &code)?;
-                writeln!(w, "\t{exact}\t{corrected}")?;
-                w.bump_line();
-                Ok(())
-            }(&mut clear_file)
-            .with_context(|_| WriteLineErr {
-                context: &clear_file,
-            })?;
-        }
+        |w: &mut ContextualizedWriter<_>| -> Result<(), _> {
+            write_code(w, &code)?;
+            writeln!(w, "\t{total}\t{exact}\t{corrected}")?;
+            w.bump_line();
+            Ok(())
+        }(&mut writer)
+        .with_context(|_| WriteLineErr { context: &writer })?;
     }
 
     let elapsed = tic.elapsed();
@@ -587,11 +573,16 @@ pub fn demultiplex(cf: &Config) -> Result<(), Error> {
     Ok(())
 }
 
-// One value for:
+/// For every barcode.
 struct Counters {
+    /// How many times it has been produced.
+    total: u64,
+    /// How many times it was an exact match against all references
+    /// (impossible if it contains `00`).
     exact: u64,
+    /// How many times it has been corrected to match against the references
+    /// (non-null distance) (also impossible if it contains `00`).
     corrected: u64,
-    unclear: u64,
 }
 
 fn write_code(writer: &mut impl Write, bx: &Barcode) -> Result<(), std::io::Error> {
-- 
GitLab