From 8c527f058b8a8dbef2ca6eb2c16838931d78cc0a Mon Sep 17 00:00:00 2001
From: Iago Bonnici <iago.bonnici@umontpellier.fr>
Date: Tue, 21 Nov 2023 08:56:34 +0100
Subject: [PATCH] Upgrade to 0.11.

---
 README.md                    | 119 +++++++++++++++++++--------------
 aphid.0.10.c => aphid.0.11.c | 126 ++++++++++++++++++++++++++++-------
 2 files changed, 171 insertions(+), 74 deletions(-)
 rename aphid.0.10.c => aphid.0.11.c (95%)

diff --git a/README.md b/README.md
index a0b0e8b..a5e42d6 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
 
 This is a C implementation of the "Aphid" method,
 following its specifications within Nicolas Galtier's draft paper
-*Phylogenetic conflicts:
-distinguishing gene flow from incomplete lineage sorting* (2023),
+*An approximate likelihood method reveals ancient gene flow
+ between human, chimpanzee and gorilla* (2023 *in prep.*),
 and the original C code.
 
 Sources available at: https://gitlab.mbb.cnrs.fr/ibonnici/aphid/.
@@ -205,11 +205,11 @@ using the `<option_name> = <option_value>` syntax (see example below):
 
 - `fixed_tau1`:
   If set to a positive value,
-  parameter τ_1 will not be optimized but rather fixed to that value
+  parameter `τ_1` will not be optimized but rather fixed to that value
   (default = `-1`).
 
 - `fixed_tau`, `fixed_theta`, `fixed_pab`, `fixed_pac`, `fixed_pbc`, `fixed_panc`:
-  Same as above for parameters τ_2, θ, p_ab, p_ac, p_bc, and p_a.
+  Same as above for parameters `τ_2`, `θ`, `p_ab`, `p_ac`, `p_bc`, and `p_a`.
   If `nb_GF_times` is set to `1`,
   then `fixed_panc` is automatically set to `1.0`.
 
@@ -222,64 +222,85 @@ and can be used for inserting comments in the control file.
 
 Parameter estimates,
 ILS/GF estimated contributions,
-discordant topology imbalance,
+discordant topology imbalance
 and various descriptors of the data and analysis
 are written to the standard output (terminal).
-If `verbose=1`, the output is detailed and human-readable.
-If `verbose=0`, the output is less detailed and written as a single line,
-in the following format:
-
-```
-nb_gene,          \
-ntopo0,           \
-ntopo1,           \
-ntopo2,           \
-ntopo3,           \
-av_lg,            \
-tau1,             \
-tau2,             \
-theta,            \
-pab,              \
-pac,              \
-pbc,              \
-pa,               \
-no_event,         \
-noconflict_ILS,   \
-noconflict_GF,    \
-conflict_ILS,     \
-conflict_GF,      \
-imbalance_ILS,    \
-dominant_ILS,     \
-imbalance_GF,     \
-dominant_GF,      \
-max_lnL
+If `verbose` is set to 1, the output is detailed and human-readable.
+If `verbose` is set to 0, the output is less detailed
+and written as a single line in the following format:
+
+```  
+_nb_gene,          \
+ntopo0,            \
+ntopo1,            \
+ntopo2,            \
+ntopo3,            \
+av_lg,             \
+tau1,              \
+tau1_low,          \
+tau1_high,         \
+tau2,              \
+tau2_low,          \
+tau2_high,         \
+theta,             \
+theta_low,         \
+theta_high,        \
+pab,               \
+pab_low,           \
+pab_high,          \
+pac,               \
+pac_low,           \
+pac_high,          \
+pbc,               \
+pbc_low,           \
+pbc_high,          \
+pa,                \
+pa_low,            \
+pa_high,           \
+no_event,          \
+noconflict_ILS,    \
+noconflict_GF,     \
+conflict_ILS,      \
+conflict_ILS_low,  \
+conflict_ILS_high, \
+conflict_GF,       \
+conflict_GF_low,   \
+conflict_GF_high,  \
+imbalance_ILS,     \
+dominant_ILS,      \
+imbalance_GF,      \
+dominant_GF,       \
+max_lnL_
 ```
 
-With:
+where:
 
 - `nb_gene`:
-  Number of analyzed gene trees after filtering
-  (detailed output gives more information on filtering steps).
+  Number of analyzed gene trees, after filtering
+  (detailed output gives mor einformation on filtering steps).
 
-- `ntopo0 ntopo1, ntopo2, ntopo3`:
-  Number of analyzed gene trees with
-  an `((A,B),C)`, `((A,C),B)`, `((B,C),A)` and `(A,B,C)=unresolved` topology,
-  respectively.
+- `ntopo, ntopo1, ntopo2, ntopo3`:
+  Number of analyzed gene trees
+  with an `((A,B),C)`, `((A,C),B)`, `((B,C),A)`
+  and `(A,B,C)` (unresolved) topology, respectively.
 
 - `av_lg`:
   Mean locus length of analyzed gene trees.
 
 - `tau1, tau2, theta, pab, pac, pbc, pa`:
-  Parameter estimates.
+  Parameters estimates,
+  with the `_low` and `_high` suffix indicating confidence interval boundaries.
 
 - `no_event`:
-  Estimated contribution of the no_event scenario.
+  Estimated contribution of the `no_event` scenario
 
 - `no_conflict_ILS`:
-  Estimated contribution of the ILS scenario with an `((A,B),C)` topology.
+  Estimated contribution of the ILS scenario
+  with an `((A,B),C)` topology.
 
 - `no_conflict_GF`:
-  Estimated contribution of GF scenarios with an `((A,B),C)` topology.
+  Estimated contribution of GF scenarios
+  with an `((A,B),C)` topology.
 
 - `conflict_ILS`:
   Estimated contribution of ILS scenarios
@@ -296,13 +317,13 @@ With:
   The most prevalent discordant topology associated with ILS.
 
 - `imbalance_GF`:
-  The estimated discordant topology imbalance associated with GF.
+  Estimated discordant topology imbalance associated with GF.
 
 - `dominant_ILS`:
   The most prevalent discordant topology associated with GF.
 
 - `max_lnL`:
-  The maximum likelihood.
+  Maximum likelihood.
 
 The `no_event`, `no_conflict_ILS` and `no_conflict_GF` posterior estimates
 are provided.
@@ -329,7 +350,7 @@ of each scenario for each gene tree.
 
 ### Gene-Tree File
 
-`exemple.in`
+`example.in`
 ```
 ((out1:0.07,out2:0.07):0.03,(oth1:0.06,((A:0.02,B:0.02):0.01,C:0.03):0.03):0.04);	1000	canonical
 ((out1:0.068,out2:0.066):0.028,(oth1:0.059,((A:0.042,C:0.044):0.002,B:0.045):0.017):0.036); 500	discordant_tall
@@ -355,7 +376,7 @@ of each scenario for each gene tree.
 
 ### Taxon file
 
-`exemple.tax`
+`example.tax`
 ```
 triplet: A, B, C
 outgroup: out1, out2
@@ -364,7 +385,7 @@ other: oth1
 
 ### Control File
 
-`exemple.opt`
+`example.opt`
 ```
 # Gene tree filtering.
 require_triplet_other_monophyly = 1
diff --git a/aphid.0.10.c b/aphid.0.11.c
similarity index 95%
rename from aphid.0.10.c
rename to aphid.0.11.c
index 7e869df..fdfda90 100644
--- a/aphid.0.10.c
+++ b/aphid.0.11.c
@@ -444,9 +444,14 @@ void find_dubious_genes(struct gene_data* gd, int nb_gene, struct option* opt){
   ratio_mean=meantl/meanntl;
   for(i=0;i<nb_gene;i++){
     ratio_i=gd[i].triplet_l/gd[i].non_triplet_l;
-    if(gd[i].to_ignore==0 && (ratio_i/ratio_mean>opt->max_clock_ratio || ratio_mean/ratio_i>opt->max_clock_ratio)){
+//if(gd[i].to_ignore==0) printf("ratio_i %f, ratio_mean %f, max %f\n", ratio_i, ratio_mean, opt->max_clock_ratio);
+    if(gd[i].to_ignore==0 && (ratio_i/ratio_mean>opt->max_clock_ratio || ratio_mean/ratio_i>opt->max_clock_ratio))
       gd[i].to_ignore=9;
-    }
+//if(gd[i].to_ignore==0 && (ratio_i>opt->max_clock_ratio || 1./ratio_i>opt->max_clock_ratio)){
+//  gd[i].to_ignore=9;
+//  printf("slow fast\n");
+//}
+
   }
   
   //clock departure 2: non-clock triplet
@@ -474,7 +479,7 @@ void print_ignored(struct gene_data* gd, int nb_gene){
   
   printf("ignoring %d genes: ", nbign_tot);
   printf("%d missing triplet; %d no outgroup; %d unexpected topology; %d unexpected branch lengths\n", nbign[1], nbign[2], nbign[3]+nbign[4]+nbign[5]+nbign[6], nbign[7]+nbign[8]+ nbign[9]+nbign[10]);
-  if(nbign[3]+nbign[4]+nbign[5]+nbign[6]) printf("[unexpected topology: %d triplet paraphyly; %d wrong root; %d triplet+outgroup paraphyly; %d outgroup paraphyly]\n", nbign[3], nbign[4], nbign[5], nbign[6]);
+  if(nbign[3]+nbign[4]+nbign[5]+nbign[6]) printf("[unexpected topology: %d triplet paraphyly; %d wrong root; %d triplet+other paraphyly; %d outgroup paraphyly]\n", nbign[3], nbign[4], nbign[5], nbign[6]);
   if(nbign[10]>0) 
     printf("[unexpected branch lengths: %d long branch; %d extreme alpha_i; %d slow/fast triplet; %d non-clock triplet]\n", nbign[7], nbign[8], nbign[9], nbign[10]); //user is expert and has activated triplet-clock filter (not mentioned in the doc)
   else
@@ -2134,19 +2139,19 @@ void print_par(struct param* par, struct confidence_interval* ci, int nbci){
   if(nbci==0){
     printf("tau1: %f, tau2: %f, theta: %f\n", par->tau1, par->tau2, par->theta);
     printf("Pac: %f, Pbc: %f, Pab:%f\n", par->pac, par->pbc, par->pab);
-    printf("Pold: %f\n", par->pone);
+    printf("Pa: %f\n", par->pone);
   }
   
   if(nbci==1){
     printf("tau1: %f, tau2: %f, theta: %f [%f-%f]\n", par->tau1, par->tau2, par->theta, ci[0].master_param_min, ci[0].master_param_max);
     printf("Pac: %f, Pbc: %f, Pab:%f\n", par->pac, par->pbc, par->pab);
-    printf("Pold: %f\n", par->pone);  
+    printf("Pa: %f\n", par->pone);  
   }
 
   if(nbci==nb_parameters){
     printf("tau1: %f [%f-%f], tau2: %f [%f-%f], theta: %f [%f-%f]\n", par->tau1, ci[0].master_param_min, ci[0].master_param_max, par->tau2, ci[1].master_param_min, ci[1].master_param_max, par->theta, ci[2].master_param_min, ci[2].master_param_max);
-    printf("Pac: %f [%f-%f], Pbc: %f [%f-%f], Pab:%f [%f-%f]\n", par->pac, ci[3].master_param_min, ci[3].master_param_max, par->pbc, ci[4].master_param_min, ci[4].master_param_max, par->pab, ci[5].master_param_min, ci[5].master_param_max);
-    printf("Pold: %f [%f-%f]\n", par->pone, ci[6].master_param_min, ci[6].master_param_max);  
+    printf("Pab: %f [%f-%f], Pac: %f [%f-%f], Pbc:%f [%f-%f]\n", par->pab, ci[3].master_param_min, ci[3].master_param_max, par->pac, ci[4].master_param_min, ci[4].master_param_max, par->pbc, ci[5].master_param_min, ci[5].master_param_max);
+    printf("Pa: %f [%f-%f]\n", par->pone, ci[6].master_param_min, ci[6].master_param_max);  
   }
 
 }
@@ -2264,6 +2269,9 @@ void calculate_scenario_likelihood(struct gene_data* gd, struct param* par, int
   lal=gd->lgseq*par->alpha[gene];
   for(i=0;i<NBRANCH;i++) expe[i]=lal*exp_lg[scenar][i];
 
+//for(i=0;i<NBRANCH;i++) printf("%d obs=%d exp=%f\n", scenar, obse[i], expe[i]);
+//getchar();
+
   logL=dlogLdt1=dlogLdt2=dlogLdth=d2logLdt1=d2logLdt2=d2logLdth=0.;
   for(i=0;i<NBRANCH;i++){
     logL+=(obse[i]*log(expe[i])-expe[i]-log_facto[obse[i]]);    
@@ -2334,6 +2342,8 @@ void calculate_gene_likelihood(struct gene_data* gd, struct param* par, struct o
   pab=par->pab; pac=par->pac; pbc=par->pbc; pone=par->pone;
   pils=exp(-2*(t2-t1)/th);
   
+//printf("pils=%f\n", pils);
+  
   //below we note L=sum_i p[i] pr[i] , where p[i] is the probability of a scenario, pr[i] is the likelihood of a scenario, 1<=i<=nb scenario
   
   //scenario probabilities
@@ -2350,6 +2360,10 @@ void calculate_gene_likelihood(struct gene_data* gd, struct param* par, struct o
     p[3*opt->nb_ILS_times+5+5*i]=p[3*opt->nb_ILS_times+6+5*i]=pbc*(1-pone)/(2.*(opt->nb_HGT_times-1));  
   }
 
+//for(i=1;i<nb_scenario_plus1;i++){
+//  printf("scenario proba: %f\n", p[i]);
+//}
+
   //scenario likelihoods and scenario likelihood derivatives wrt t1, t2, th
   for(i=1;i<nb_scenario_plus1;i++){
     pr[i]=dprdt1[i]=dprdt2[i]=dprdth[i]=d2prdt1[i]=d2prdt2[i]=d2prdth[i]=0.;
@@ -2771,7 +2785,6 @@ struct gene_likelihood* optimize_lnL_starting(struct gene_data* gdata, int nbg,
 }
 
 
-
 /* optimize_lnL */
 struct gene_likelihood* optimize_lnL(struct gene_data* gdata, int nbg, struct option opt, struct param* final_par, double* max_maxlnL){
   struct param start_par, arg_par;
@@ -2886,7 +2899,53 @@ void annotate_global(struct gene_likelihood* glh, int nb_gene, struct posterior*
 }
 
 
+/* calculate_logLn_degenerate */
+double calculate_logLn_degenerate(struct gene_data* gdata, int nbg){
+
+  double lnLd, Ld_gene, lnLd_scenar;
+  int i, j;
+  int oa, ob, oc, od;
+  double ea, eb, ec, ed;
+  double oalogea, oblogeb, oclogec, odloged;
+  
+  lnLd=0;
+  
+  for(i=0;i<nbg;i++){ //for ech gene tree
+
+    oa=(round)(gdata[i].lgseq*gdata[i].a);
+    ob=(round)(gdata[i].lgseq*gdata[i].b);
+    oc=(round)(gdata[i].lgseq*gdata[i].c);
+    od=(round)(gdata[i].lgseq*gdata[i].d);
+    Ld_gene=0.;
+    
+    for(j=0;j<nbg;j++){ //consider every (other or not) gene tree as a scenario, with expected = observed
+      lnLd_scenar=0.;
+      ea=gdata[j].lgseq*gdata[j].a;
+      eb=gdata[j].lgseq*gdata[j].b;
+      ec=gdata[j].lgseq*gdata[j].c;
+      ed=gdata[j].lgseq*gdata[j].d;
+      
+      if(ea>0) oalogea=oa*log(ea); else oalogea=0;
+      if(eb>0) oblogeb=ob*log(eb); else oblogeb=0;
+      if(ec>0) oclogec=oc*log(ec); else oclogec=0;
+      if(ed>0) odloged=od*log(ed); else odloged=0;
+    
+      lnLd_scenar+=oalogea-ea-log_facto[oa];
+      lnLd_scenar+=oblogeb-eb-log_facto[ob];
+      lnLd_scenar+=oclogec-ec-log_facto[oc];
+      lnLd_scenar+=odloged-ed-log_facto[od];
+
+      Ld_gene+=exp(lnLd_scenar)/nbg;    //each scenario has probability 1./nbg
+    }        
+
 
+    lnLd+=log(Ld_gene);  
+  }
+  
+   
+  return lnLd;
+
+}
 
 
 /* calculate_ci */
@@ -3038,7 +3097,8 @@ int main(int argc, char** argv){
   struct confidence_interval* ci;
   int nb_gene, nb_ignored, nb_sp, reti, nbok, nbci, i, j;
   int ntopo1, ntopo2, ntopo3, ntopo0;
-  double max_lnL;
+  int gof_ddl;
+  double max_lnL, lnL_degn;
   double opt_pab, opt_pac, opt_pbc, opt_pils;
   int npred_ils, npred_hgt, nils_a, nils_b, nhgt_a, nhgt_b;
   double av_lg, av_a, av_b, av_c, av_d, sum_post, thresh;
@@ -3114,13 +3174,25 @@ int main(int argc, char** argv){
 
   if(opt.verbose) printf("\n");
   
+  // gof (not nested models so null ditrib to be approached via simulations)
+/*  
+  lnL_degn=calculate_logLn_degenerate(gdata, nb_gene);
+  gof_ddl=5*nb_gene-1-nb_parameters;
+  if(opt.verbose) printf("maxlnL:%f, deglnL:%f, 2delta_lnL:%f, ddl:%d, normal_deviate:%f\n", max_lnL, lnL_degn, 2*(lnL_degn-max_lnL), gof_ddl, (2*(lnL_degn-max_lnL)-gof_ddl)/sqrt(2*gof_ddl));
+  //getchar();
+*/
+
   // confidence intervals
+  ci=check_alloc(nb_parameters, sizeof(struct confidence_interval));
+  for(i=0;i<nb_parameters;i++){
+    ci[i].master_param_min=ci[i].master_param_max=-1.;
+    ci[i].post_min.conflict_ILS=ci[i].post_max.conflict_ILS=ci[i].post_min.conflict_HGT=ci[i].post_max.conflict_HGT=-1.;
+  }
   if(opt.do_param_CI){
     if(opt.verbose) printf("Confidence intervals:\n");     
     nbci=nb_parameters;
-    ci=check_alloc(nbci, sizeof(struct confidence_interval));
     for(i=0;i<nb_parameters;i++){
-      printf("%s...\n", parameter_name[i]);
+      if(opt.verbose) printf("%s...\n", parameter_name[i]);
       reti=calculate_ci(gdata, nb_gene, opt, optimal_par, max_lnL, parameter_name[i], ci+i);
       //if(reti==0) {nbci=0; break;}
     }
@@ -3128,8 +3200,7 @@ int main(int argc, char** argv){
   else if(opt.do_ILS_GF_CI){
     if(opt.verbose) printf("Confidence intervals...\n");
     nbci=1;
-    ci=check_alloc(nbci, sizeof(struct confidence_interval));
-    reti=calculate_ci(gdata, nb_gene, opt, optimal_par, max_lnL, "theta", ci);
+    reti=calculate_ci(gdata, nb_gene, opt, optimal_par, max_lnL, "theta", ci+2);
     if(reti==0) nbci=0;
   }
   else nbci=0;
@@ -3185,27 +3256,32 @@ int main(int argc, char** argv){
     printf("posterior no-conflict GF : %.3f\n\n", post.noconflict_HGT);
         
     printf("posterior conflict ILS: %.3f", post.conflict_ILS);
-    if(nbci==1) printf(" [%.3f-%.3f]", ci[0].post_min.conflict_ILS, ci[0].post_max.conflict_ILS);
-    if(nbci==nb_parameters) printf(" [%.3f-%.3f]", ci[2].post_min.conflict_ILS, ci[2].post_max.conflict_ILS);
+    if(nbci!=0) printf(" [%.3f-%.3f]", ci[2].post_min.conflict_ILS, ci[2].post_max.conflict_ILS);
     printf("\n");
     printf("posterior conflict GF : %.3f", post.conflict_HGT);
-    if(nbci==1) printf(" [%.3f-%.3f]", ci[0].post_min.conflict_HGT, ci[0].post_max.conflict_HGT);
-    if(nbci==nb_parameters) printf(" [%.3f-%.3f]", ci[0].post_min.conflict_HGT, ci[0].post_max.conflict_HGT);
+    if(nbci!=0) printf(" [%.3f-%.3f]", ci[2].post_max.conflict_HGT, ci[2].post_min.conflict_HGT);
     printf("\n\n");   
     printf("posterior imbalance ILS: %.3f     [%s dominant]\n", post.asymmetry_ILS, ctopol(post.dominant_ILS));
     printf("posterior imbalance GF : %.3f     [%s dominant]\n", post.asymmetry_HGT, ctopol(post.dominant_HGT));
     printf("\n");  
-            
-/*  thresh=opt.posterior_threshold;
-    printf("%d genes with probability of ILS above %.3f \n", npred_ils, thresh);
-    printf("%d genes with probability of HGT above %.3f \n", npred_hgt, thresh);
-    printf("\n");  
-*/
   }
   
   //one-line output
-  if(!opt.verbose) printf("%d,%d,%d,%d,%d,%.1f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%s,%f,%s,%f\n", nb_gene, ntopo0, ntopo1, ntopo2, ntopo3, av_lg, optimal_par.tau1, optimal_par.tau2, optimal_par.theta, optimal_par.pab, optimal_par.pac, optimal_par.pbc, optimal_par.pone, post.basic, post.noconflict_ILS, post.noconflict_HGT, post.conflict_ILS, post.conflict_HGT, post.asymmetry_ILS, ctopol(post.dominant_ILS), post.asymmetry_HGT, ctopol(post.dominant_HGT), max_lnL);
-  
+  if(!opt.verbose){ 
+    //printf("%d,%d,%d,%d,%d,%.1f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%s,%f,%s,%f\n", nb_gene, ntopo0, ntopo1, ntopo2, ntopo3, av_lg, optimal_par.tau1, optimal_par.tau2, optimal_par.theta, optimal_par.pab, optimal_par.pac, optimal_par.pbc, optimal_par.pone, post.basic, post.noconflict_ILS, post.noconflict_HGT, post.conflict_ILS, post.conflict_HGT, post.asymmetry_ILS, ctopol(post.dominant_ILS), post.asymmetry_HGT, ctopol(post.dominant_HGT), max_lnL);
+    printf("%d,%d,%d,%d,%d,%.1f,", nb_gene, ntopo0, ntopo1, ntopo2, ntopo3, av_lg);
+    printf("%f,%f,%f,", optimal_par.tau1, ci[0].master_param_min, ci[0].master_param_max);
+    printf("%f,%f,%f,", optimal_par.tau2, ci[1].master_param_min, ci[1].master_param_max);
+    printf("%f,%f,%f,", optimal_par.theta, ci[2].master_param_min, ci[2].master_param_max);
+    printf("%f,%f,%f,", optimal_par.pab, ci[3].master_param_min, ci[3].master_param_max); 
+    printf("%f,%f,%f,", optimal_par.pac, ci[4].master_param_min, ci[4].master_param_max);
+    printf("%f,%f,%f,", optimal_par.pbc, ci[5].master_param_min, ci[5].master_param_max);
+    printf("%f,%f,%f,", optimal_par.pone, ci[6].master_param_min, ci[6].master_param_max);
+    printf("%f,%f,%f,", post.basic, post.noconflict_ILS, post.noconflict_HGT);
+    printf("%f,%f,%f,", post.conflict_ILS, ci[2].post_min.conflict_ILS, ci[2].post_max.conflict_ILS);
+    printf("%f,%f,%f,", post.conflict_HGT, ci[2].post_max.conflict_HGT, ci[2].post_min.conflict_HGT);
+    printf("%f,%s,%f,%s,%f\n", post.asymmetry_ILS, ctopol(post.dominant_ILS), post.asymmetry_HGT, ctopol(post.dominant_HGT), max_lnL);
+  }
 }
 
 
-- 
GitLab