diff --git a/README.md b/README.md index a0b0e8b2c93ba6815650dc6a9cf61612532db5e8..a5e42d661ad5b11e6422bf41d62086c2df3da09b 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,8 @@ This is a C implementation of the "Aphid" method, following its specifications within Nicolas Galtier's draft paper -*Phylogenetic conflicts: -distinguishing gene flow from incomplete lineage sorting* (2023), +*An approximate likelihood method reveals ancient gene flow + between human, chimpanzee and gorilla* (2023 *in prep.*), and the original C code. Sources available at: https://gitlab.mbb.cnrs.fr/ibonnici/aphid/. @@ -205,11 +205,11 @@ using the `<option_name> = <option_value>` syntax (see example below): - `fixed_tau1`: If set to a positive value, - parameter τ_1 will not be optimized but rather fixed to that value + parameter `τ_1` will not be optimized but rather fixed to that value (default = `-1`). - `fixed_tau`, `fixed_theta`, `fixed_pab`, `fixed_pac`, `fixed_pbc`, `fixed_panc`: - Same as above for parameters τ_2, θ, p_ab, p_ac, p_bc, and p_a. + Same as above for parameters `τ_2`, `θ`, `p_ab`, `p_ac`, `p_bc`, and `p_a`. If `nb_GF_times` is set to `1`, then `fixed_panc` is automatically set to `1.0`. @@ -222,64 +222,85 @@ and can be used for inserting comments in the control file. Parameter estimates, ILS/GF estimated contributions, -discordant topology imbalance, +discordant topology imbalance and various descriptors of the data and analysis are written to the standard output (terminal). -If `verbose=1`, the output is detailed and human-readable. -If `verbose=0`, the output is less detailed and written as a single line, -in the following format: - -``` -nb_gene, \ -ntopo0, \ -ntopo1, \ -ntopo2, \ -ntopo3, \ -av_lg, \ -tau1, \ -tau2, \ -theta, \ -pab, \ -pac, \ -pbc, \ -pa, \ -no_event, \ -noconflict_ILS, \ -noconflict_GF, \ -conflict_ILS, \ -conflict_GF, \ -imbalance_ILS, \ -dominant_ILS, \ -imbalance_GF, \ -dominant_GF, \ -max_lnL +If `verbose` is set to 1, the output is detailed and human-readable. +If `verbose` is set to 0, the output is less detailed +and written as a single line in the following format: + +``` +_nb_gene, \ +ntopo0, \ +ntopo1, \ +ntopo2, \ +ntopo3, \ +av_lg, \ +tau1, \ +tau1_low, \ +tau1_high, \ +tau2, \ +tau2_low, \ +tau2_high, \ +theta, \ +theta_low, \ +theta_high, \ +pab, \ +pab_low, \ +pab_high, \ +pac, \ +pac_low, \ +pac_high, \ +pbc, \ +pbc_low, \ +pbc_high, \ +pa, \ +pa_low, \ +pa_high, \ +no_event, \ +noconflict_ILS, \ +noconflict_GF, \ +conflict_ILS, \ +conflict_ILS_low, \ +conflict_ILS_high, \ +conflict_GF, \ +conflict_GF_low, \ +conflict_GF_high, \ +imbalance_ILS, \ +dominant_ILS, \ +imbalance_GF, \ +dominant_GF, \ +max_lnL_ ``` -With: +where: - `nb_gene`: - Number of analyzed gene trees after filtering - (detailed output gives more information on filtering steps). + Number of analyzed gene trees, after filtering + (detailed output gives mor einformation on filtering steps). -- `ntopo0 ntopo1, ntopo2, ntopo3`: - Number of analyzed gene trees with - an `((A,B),C)`, `((A,C),B)`, `((B,C),A)` and `(A,B,C)=unresolved` topology, - respectively. +- `ntopo, ntopo1, ntopo2, ntopo3`: + Number of analyzed gene trees + with an `((A,B),C)`, `((A,C),B)`, `((B,C),A)` + and `(A,B,C)` (unresolved) topology, respectively. - `av_lg`: Mean locus length of analyzed gene trees. - `tau1, tau2, theta, pab, pac, pbc, pa`: - Parameter estimates. + Parameters estimates, + with the `_low` and `_high` suffix indicating confidence interval boundaries. - `no_event`: - Estimated contribution of the no_event scenario. + Estimated contribution of the `no_event` scenario - `no_conflict_ILS`: - Estimated contribution of the ILS scenario with an `((A,B),C)` topology. + Estimated contribution of the ILS scenario + with an `((A,B),C)` topology. - `no_conflict_GF`: - Estimated contribution of GF scenarios with an `((A,B),C)` topology. + Estimated contribution of GF scenarios + with an `((A,B),C)` topology. - `conflict_ILS`: Estimated contribution of ILS scenarios @@ -296,13 +317,13 @@ With: The most prevalent discordant topology associated with ILS. - `imbalance_GF`: - The estimated discordant topology imbalance associated with GF. + Estimated discordant topology imbalance associated with GF. - `dominant_ILS`: The most prevalent discordant topology associated with GF. - `max_lnL`: - The maximum likelihood. + Maximum likelihood. The `no_event`, `no_conflict_ILS` and `no_conflict_GF` posterior estimates are provided. @@ -329,7 +350,7 @@ of each scenario for each gene tree. ### Gene-Tree File -`exemple.in` +`example.in` ``` ((out1:0.07,out2:0.07):0.03,(oth1:0.06,((A:0.02,B:0.02):0.01,C:0.03):0.03):0.04); 1000 canonical ((out1:0.068,out2:0.066):0.028,(oth1:0.059,((A:0.042,C:0.044):0.002,B:0.045):0.017):0.036); 500 discordant_tall @@ -355,7 +376,7 @@ of each scenario for each gene tree. ### Taxon file -`exemple.tax` +`example.tax` ``` triplet: A, B, C outgroup: out1, out2 @@ -364,7 +385,7 @@ other: oth1 ### Control File -`exemple.opt` +`example.opt` ``` # Gene tree filtering. require_triplet_other_monophyly = 1 diff --git a/aphid.0.10.c b/aphid.0.11.c similarity index 95% rename from aphid.0.10.c rename to aphid.0.11.c index 7e869dfd8289d3523f4437ddb6b3fcf63da539f1..fdfda90d898cf9c0fe7a5cb2a37d6745e1f79d37 100644 --- a/aphid.0.10.c +++ b/aphid.0.11.c @@ -444,9 +444,14 @@ void find_dubious_genes(struct gene_data* gd, int nb_gene, struct option* opt){ ratio_mean=meantl/meanntl; for(i=0;i<nb_gene;i++){ ratio_i=gd[i].triplet_l/gd[i].non_triplet_l; - if(gd[i].to_ignore==0 && (ratio_i/ratio_mean>opt->max_clock_ratio || ratio_mean/ratio_i>opt->max_clock_ratio)){ +//if(gd[i].to_ignore==0) printf("ratio_i %f, ratio_mean %f, max %f\n", ratio_i, ratio_mean, opt->max_clock_ratio); + if(gd[i].to_ignore==0 && (ratio_i/ratio_mean>opt->max_clock_ratio || ratio_mean/ratio_i>opt->max_clock_ratio)) gd[i].to_ignore=9; - } +//if(gd[i].to_ignore==0 && (ratio_i>opt->max_clock_ratio || 1./ratio_i>opt->max_clock_ratio)){ +// gd[i].to_ignore=9; +// printf("slow fast\n"); +//} + } //clock departure 2: non-clock triplet @@ -474,7 +479,7 @@ void print_ignored(struct gene_data* gd, int nb_gene){ printf("ignoring %d genes: ", nbign_tot); printf("%d missing triplet; %d no outgroup; %d unexpected topology; %d unexpected branch lengths\n", nbign[1], nbign[2], nbign[3]+nbign[4]+nbign[5]+nbign[6], nbign[7]+nbign[8]+ nbign[9]+nbign[10]); - if(nbign[3]+nbign[4]+nbign[5]+nbign[6]) printf("[unexpected topology: %d triplet paraphyly; %d wrong root; %d triplet+outgroup paraphyly; %d outgroup paraphyly]\n", nbign[3], nbign[4], nbign[5], nbign[6]); + if(nbign[3]+nbign[4]+nbign[5]+nbign[6]) printf("[unexpected topology: %d triplet paraphyly; %d wrong root; %d triplet+other paraphyly; %d outgroup paraphyly]\n", nbign[3], nbign[4], nbign[5], nbign[6]); if(nbign[10]>0) printf("[unexpected branch lengths: %d long branch; %d extreme alpha_i; %d slow/fast triplet; %d non-clock triplet]\n", nbign[7], nbign[8], nbign[9], nbign[10]); //user is expert and has activated triplet-clock filter (not mentioned in the doc) else @@ -2134,19 +2139,19 @@ void print_par(struct param* par, struct confidence_interval* ci, int nbci){ if(nbci==0){ printf("tau1: %f, tau2: %f, theta: %f\n", par->tau1, par->tau2, par->theta); printf("Pac: %f, Pbc: %f, Pab:%f\n", par->pac, par->pbc, par->pab); - printf("Pold: %f\n", par->pone); + printf("Pa: %f\n", par->pone); } if(nbci==1){ printf("tau1: %f, tau2: %f, theta: %f [%f-%f]\n", par->tau1, par->tau2, par->theta, ci[0].master_param_min, ci[0].master_param_max); printf("Pac: %f, Pbc: %f, Pab:%f\n", par->pac, par->pbc, par->pab); - printf("Pold: %f\n", par->pone); + printf("Pa: %f\n", par->pone); } if(nbci==nb_parameters){ printf("tau1: %f [%f-%f], tau2: %f [%f-%f], theta: %f [%f-%f]\n", par->tau1, ci[0].master_param_min, ci[0].master_param_max, par->tau2, ci[1].master_param_min, ci[1].master_param_max, par->theta, ci[2].master_param_min, ci[2].master_param_max); - printf("Pac: %f [%f-%f], Pbc: %f [%f-%f], Pab:%f [%f-%f]\n", par->pac, ci[3].master_param_min, ci[3].master_param_max, par->pbc, ci[4].master_param_min, ci[4].master_param_max, par->pab, ci[5].master_param_min, ci[5].master_param_max); - printf("Pold: %f [%f-%f]\n", par->pone, ci[6].master_param_min, ci[6].master_param_max); + printf("Pab: %f [%f-%f], Pac: %f [%f-%f], Pbc:%f [%f-%f]\n", par->pab, ci[3].master_param_min, ci[3].master_param_max, par->pac, ci[4].master_param_min, ci[4].master_param_max, par->pbc, ci[5].master_param_min, ci[5].master_param_max); + printf("Pa: %f [%f-%f]\n", par->pone, ci[6].master_param_min, ci[6].master_param_max); } } @@ -2264,6 +2269,9 @@ void calculate_scenario_likelihood(struct gene_data* gd, struct param* par, int lal=gd->lgseq*par->alpha[gene]; for(i=0;i<NBRANCH;i++) expe[i]=lal*exp_lg[scenar][i]; +//for(i=0;i<NBRANCH;i++) printf("%d obs=%d exp=%f\n", scenar, obse[i], expe[i]); +//getchar(); + logL=dlogLdt1=dlogLdt2=dlogLdth=d2logLdt1=d2logLdt2=d2logLdth=0.; for(i=0;i<NBRANCH;i++){ logL+=(obse[i]*log(expe[i])-expe[i]-log_facto[obse[i]]); @@ -2334,6 +2342,8 @@ void calculate_gene_likelihood(struct gene_data* gd, struct param* par, struct o pab=par->pab; pac=par->pac; pbc=par->pbc; pone=par->pone; pils=exp(-2*(t2-t1)/th); +//printf("pils=%f\n", pils); + //below we note L=sum_i p[i] pr[i] , where p[i] is the probability of a scenario, pr[i] is the likelihood of a scenario, 1<=i<=nb scenario //scenario probabilities @@ -2350,6 +2360,10 @@ void calculate_gene_likelihood(struct gene_data* gd, struct param* par, struct o p[3*opt->nb_ILS_times+5+5*i]=p[3*opt->nb_ILS_times+6+5*i]=pbc*(1-pone)/(2.*(opt->nb_HGT_times-1)); } +//for(i=1;i<nb_scenario_plus1;i++){ +// printf("scenario proba: %f\n", p[i]); +//} + //scenario likelihoods and scenario likelihood derivatives wrt t1, t2, th for(i=1;i<nb_scenario_plus1;i++){ pr[i]=dprdt1[i]=dprdt2[i]=dprdth[i]=d2prdt1[i]=d2prdt2[i]=d2prdth[i]=0.; @@ -2771,7 +2785,6 @@ struct gene_likelihood* optimize_lnL_starting(struct gene_data* gdata, int nbg, } - /* optimize_lnL */ struct gene_likelihood* optimize_lnL(struct gene_data* gdata, int nbg, struct option opt, struct param* final_par, double* max_maxlnL){ struct param start_par, arg_par; @@ -2886,7 +2899,53 @@ void annotate_global(struct gene_likelihood* glh, int nb_gene, struct posterior* } +/* calculate_logLn_degenerate */ +double calculate_logLn_degenerate(struct gene_data* gdata, int nbg){ + + double lnLd, Ld_gene, lnLd_scenar; + int i, j; + int oa, ob, oc, od; + double ea, eb, ec, ed; + double oalogea, oblogeb, oclogec, odloged; + + lnLd=0; + + for(i=0;i<nbg;i++){ //for ech gene tree + + oa=(round)(gdata[i].lgseq*gdata[i].a); + ob=(round)(gdata[i].lgseq*gdata[i].b); + oc=(round)(gdata[i].lgseq*gdata[i].c); + od=(round)(gdata[i].lgseq*gdata[i].d); + Ld_gene=0.; + + for(j=0;j<nbg;j++){ //consider every (other or not) gene tree as a scenario, with expected = observed + lnLd_scenar=0.; + ea=gdata[j].lgseq*gdata[j].a; + eb=gdata[j].lgseq*gdata[j].b; + ec=gdata[j].lgseq*gdata[j].c; + ed=gdata[j].lgseq*gdata[j].d; + + if(ea>0) oalogea=oa*log(ea); else oalogea=0; + if(eb>0) oblogeb=ob*log(eb); else oblogeb=0; + if(ec>0) oclogec=oc*log(ec); else oclogec=0; + if(ed>0) odloged=od*log(ed); else odloged=0; + + lnLd_scenar+=oalogea-ea-log_facto[oa]; + lnLd_scenar+=oblogeb-eb-log_facto[ob]; + lnLd_scenar+=oclogec-ec-log_facto[oc]; + lnLd_scenar+=odloged-ed-log_facto[od]; + + Ld_gene+=exp(lnLd_scenar)/nbg; //each scenario has probability 1./nbg + } + + lnLd+=log(Ld_gene); + } + + + return lnLd; + +} /* calculate_ci */ @@ -3038,7 +3097,8 @@ int main(int argc, char** argv){ struct confidence_interval* ci; int nb_gene, nb_ignored, nb_sp, reti, nbok, nbci, i, j; int ntopo1, ntopo2, ntopo3, ntopo0; - double max_lnL; + int gof_ddl; + double max_lnL, lnL_degn; double opt_pab, opt_pac, opt_pbc, opt_pils; int npred_ils, npred_hgt, nils_a, nils_b, nhgt_a, nhgt_b; double av_lg, av_a, av_b, av_c, av_d, sum_post, thresh; @@ -3114,13 +3174,25 @@ int main(int argc, char** argv){ if(opt.verbose) printf("\n"); + // gof (not nested models so null ditrib to be approached via simulations) +/* + lnL_degn=calculate_logLn_degenerate(gdata, nb_gene); + gof_ddl=5*nb_gene-1-nb_parameters; + if(opt.verbose) printf("maxlnL:%f, deglnL:%f, 2delta_lnL:%f, ddl:%d, normal_deviate:%f\n", max_lnL, lnL_degn, 2*(lnL_degn-max_lnL), gof_ddl, (2*(lnL_degn-max_lnL)-gof_ddl)/sqrt(2*gof_ddl)); + //getchar(); +*/ + // confidence intervals + ci=check_alloc(nb_parameters, sizeof(struct confidence_interval)); + for(i=0;i<nb_parameters;i++){ + ci[i].master_param_min=ci[i].master_param_max=-1.; + ci[i].post_min.conflict_ILS=ci[i].post_max.conflict_ILS=ci[i].post_min.conflict_HGT=ci[i].post_max.conflict_HGT=-1.; + } if(opt.do_param_CI){ if(opt.verbose) printf("Confidence intervals:\n"); nbci=nb_parameters; - ci=check_alloc(nbci, sizeof(struct confidence_interval)); for(i=0;i<nb_parameters;i++){ - printf("%s...\n", parameter_name[i]); + if(opt.verbose) printf("%s...\n", parameter_name[i]); reti=calculate_ci(gdata, nb_gene, opt, optimal_par, max_lnL, parameter_name[i], ci+i); //if(reti==0) {nbci=0; break;} } @@ -3128,8 +3200,7 @@ int main(int argc, char** argv){ else if(opt.do_ILS_GF_CI){ if(opt.verbose) printf("Confidence intervals...\n"); nbci=1; - ci=check_alloc(nbci, sizeof(struct confidence_interval)); - reti=calculate_ci(gdata, nb_gene, opt, optimal_par, max_lnL, "theta", ci); + reti=calculate_ci(gdata, nb_gene, opt, optimal_par, max_lnL, "theta", ci+2); if(reti==0) nbci=0; } else nbci=0; @@ -3185,27 +3256,32 @@ int main(int argc, char** argv){ printf("posterior no-conflict GF : %.3f\n\n", post.noconflict_HGT); printf("posterior conflict ILS: %.3f", post.conflict_ILS); - if(nbci==1) printf(" [%.3f-%.3f]", ci[0].post_min.conflict_ILS, ci[0].post_max.conflict_ILS); - if(nbci==nb_parameters) printf(" [%.3f-%.3f]", ci[2].post_min.conflict_ILS, ci[2].post_max.conflict_ILS); + if(nbci!=0) printf(" [%.3f-%.3f]", ci[2].post_min.conflict_ILS, ci[2].post_max.conflict_ILS); printf("\n"); printf("posterior conflict GF : %.3f", post.conflict_HGT); - if(nbci==1) printf(" [%.3f-%.3f]", ci[0].post_min.conflict_HGT, ci[0].post_max.conflict_HGT); - if(nbci==nb_parameters) printf(" [%.3f-%.3f]", ci[0].post_min.conflict_HGT, ci[0].post_max.conflict_HGT); + if(nbci!=0) printf(" [%.3f-%.3f]", ci[2].post_max.conflict_HGT, ci[2].post_min.conflict_HGT); printf("\n\n"); printf("posterior imbalance ILS: %.3f [%s dominant]\n", post.asymmetry_ILS, ctopol(post.dominant_ILS)); printf("posterior imbalance GF : %.3f [%s dominant]\n", post.asymmetry_HGT, ctopol(post.dominant_HGT)); printf("\n"); - -/* thresh=opt.posterior_threshold; - printf("%d genes with probability of ILS above %.3f \n", npred_ils, thresh); - printf("%d genes with probability of HGT above %.3f \n", npred_hgt, thresh); - printf("\n"); -*/ } //one-line output - if(!opt.verbose) printf("%d,%d,%d,%d,%d,%.1f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%s,%f,%s,%f\n", nb_gene, ntopo0, ntopo1, ntopo2, ntopo3, av_lg, optimal_par.tau1, optimal_par.tau2, optimal_par.theta, optimal_par.pab, optimal_par.pac, optimal_par.pbc, optimal_par.pone, post.basic, post.noconflict_ILS, post.noconflict_HGT, post.conflict_ILS, post.conflict_HGT, post.asymmetry_ILS, ctopol(post.dominant_ILS), post.asymmetry_HGT, ctopol(post.dominant_HGT), max_lnL); - + if(!opt.verbose){ + //printf("%d,%d,%d,%d,%d,%.1f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%s,%f,%s,%f\n", nb_gene, ntopo0, ntopo1, ntopo2, ntopo3, av_lg, optimal_par.tau1, optimal_par.tau2, optimal_par.theta, optimal_par.pab, optimal_par.pac, optimal_par.pbc, optimal_par.pone, post.basic, post.noconflict_ILS, post.noconflict_HGT, post.conflict_ILS, post.conflict_HGT, post.asymmetry_ILS, ctopol(post.dominant_ILS), post.asymmetry_HGT, ctopol(post.dominant_HGT), max_lnL); + printf("%d,%d,%d,%d,%d,%.1f,", nb_gene, ntopo0, ntopo1, ntopo2, ntopo3, av_lg); + printf("%f,%f,%f,", optimal_par.tau1, ci[0].master_param_min, ci[0].master_param_max); + printf("%f,%f,%f,", optimal_par.tau2, ci[1].master_param_min, ci[1].master_param_max); + printf("%f,%f,%f,", optimal_par.theta, ci[2].master_param_min, ci[2].master_param_max); + printf("%f,%f,%f,", optimal_par.pab, ci[3].master_param_min, ci[3].master_param_max); + printf("%f,%f,%f,", optimal_par.pac, ci[4].master_param_min, ci[4].master_param_max); + printf("%f,%f,%f,", optimal_par.pbc, ci[5].master_param_min, ci[5].master_param_max); + printf("%f,%f,%f,", optimal_par.pone, ci[6].master_param_min, ci[6].master_param_max); + printf("%f,%f,%f,", post.basic, post.noconflict_ILS, post.noconflict_HGT); + printf("%f,%f,%f,", post.conflict_ILS, ci[2].post_min.conflict_ILS, ci[2].post_max.conflict_ILS); + printf("%f,%f,%f,", post.conflict_HGT, ci[2].post_max.conflict_HGT, ci[2].post_min.conflict_HGT); + printf("%f,%s,%f,%s,%f\n", post.asymmetry_ILS, ctopol(post.dominant_ILS), post.asymmetry_HGT, ctopol(post.dominant_HGT), max_lnL); + } }