diff --git a/aphid.0.9.c b/aphid.0.10.c similarity index 75% rename from aphid.0.9.c rename to aphid.0.10.c index bad1d37c6e9d92b84dbc4b8ee59361dc9bd68b49..7e869dfd8289d3523f4437ddb6b3fcf63da539f1 100644 --- a/aphid.0.9.c +++ b/aphid.0.10.c @@ -4,8 +4,9 @@ #include <math.h> #define MAXLLINE 1000 //max line length in option and taxon files -#define MAXLNAME 200 //max taxon name length -#define MAXNSP 200 //max nb of species in input trees +#define MAXLNAME 500 //max taxon name length +#define MAXNSP 500 //max nb of species in input trees +#define MAX_PARAMETER_NAME_LG 50 #define QuasiZero 0.000001 @@ -13,29 +14,31 @@ #define NBRANCH 4 //relevant ranches: a, b, c, d #define NTIME 3 //time-dimensioned parameter: tau1, tau2, theta -#define MAX_BL 200 //max branch length * seq length -#define MAX_ALPHA 10 //max ratio between ref lg and mean ref lg -#define MAX_CLOCK 2 //max ratio between triplet root-to-tip and outgroup+other root-to-tip -#define TRIPLET_CLOCK_THRESH 0.001 //max p-value of max triplet branch length knowing mean triplet branch length - -#define MIN_D 0.5 //if d*lgseq is below this threshold triplet topology will be condidered unresolved - #define MAX_PHGT 0.666 //maximal value for pab+pac+pbc -#define MAX_BACKWARD_MOVES 10 -#define MAX_ITERATION 1000 -#define CONVERGENCE 0.000001 -#define DEFAULT_POSTERIOR_THRESHOLD 0.90 +#define DEFAULT_POSTERIOR_THRESHOLD 0.95 +#define DEFAULT_nb_ILS_times 1 #define DEFAULT_nb_HGT_times 2 -#define DEFAULT_HGT_TIME_1 0.5 -#define DEFAULT_HGT_TIME_2 1 -#define MAX_nb_HGT_times 100 +#define DEFAULT_HGT_TIME_1 1 +#define DEFAULT_HGT_TIME_2 0.5 +#define DEFAULT_STRICT 0 +#define DEFAULT_REQUIRE_TO_MONOPH 1 +#define DEFAULT_REQUIRE_OUTG_MONOPH 1 +#define DEFAULT_MAX_ALPHAI 10 +#define DEFAULT_MAX_BL 200 +#define DEFAULT_MAX_CLOCK_RATIO 2 +#define DEFAULT_TRIPLET_CLOCK_THRESH 0.0 +#define DEFAULT_MIN_D 0.5 +#define DEFAULT_MAX_BACKWARD_MOVES 10 +#define DEFAULT_MAX_ITERATIONS 1000 +#define DEFAULT_CONVERGENCE 0.000001 +#define DEFAULT_DO_CI1 0 +#define DEFAULT_DO_CI2 0 +#define DEFAULT_CI_THR 1.92 -#define MAX_PARAMETER_NAME_LG 50 - - -#define NB_FILTERING_CASES 9 +#define MAX_nb_HGT_times 100 +#define NB_FILTERING_CASES 11 int debug; @@ -65,6 +68,8 @@ double ***exp_lg_coeff; double theta_start[NB_THETA_START]={0.0001, 0.001, 0.005, 0.01}; int nb_scenario_plus1; +char** scenario_name; +//int** scenario_type; //1=basic; 2=ILS; 3=HGT int it; int nb_parameters; @@ -72,11 +77,26 @@ char** parameter_name; struct option{ + int nb_ILS_times; //number of times for the second (oldest) ILS time; uniform distribution over these times assumed, with mean equal to t2+theta/6+theta/2 + double* discr_expo_mean1; //means of equal-weights bins of a discretized exponential distribution of mean 1 int nb_HGT_times; double* HGT_time_coeff; //the ith possible HGT time is HGT_time_coeff[i]*t1 + int strict_no_event; //if true, under the no-event scenario, the younger coalescence time must be <t2; free otherwise (I didn't find this useful on simul) + + int require_triplet_other_monophyly; + int require_outgroup_monophyly; + double max_alphai; + double max_bl; + double max_clock_ratio; + double triplet_clock_thresh; + + int max_backward_moves; + int max_iterations; + double convergence; + double posterior_threshold; //a posterior above this threshold => a scenario or scenario type is considered well supported, for a specific locus + double min_d; //if d*lgseq is below this threshold triplet topology will be condidered unresolved - int verbose; //1=normal output; 0=one-line output int optimize_tau1, optimize_tau2, optimize_theta; int optimize_pab, optimize_pac, optimize_pbc, optimize_pone; @@ -85,6 +105,8 @@ struct option{ int do_ILS_GF_CI, do_param_CI; //confidence interval double CI_max_lnL_diff; //max difference in log likelihood between optimum and any theta value in confidence interval + + int verbose; //1=normal output; 0=one-line output }; @@ -94,7 +116,7 @@ struct gene_data{ double a, b, c, d, e, l; double triplet_l, non_triplet_l; //for clock-likeness check int lgseq; - int to_ignore; //0=ok ; 1=missing >=1 triplet sp ; 2=missing all outgroup sp ; 3=missing all other sp ; 4=non-monophymetis triplet ; 5=non-monophyletic outgroup ; 6=non-monophymetis triplet+outgroup ; 7=dubious branch lengths ; 8=dubious ref length + int to_ignore; //0=ok ; 1=missing >=1 triplet sp ; 2=missing all outgroup sp ; 3=non-monophyletic triplet ; 5=non-monophyletic triplet + other ; 6=non-monophyletic outgroup ; 7=long branch; 8=extreme alpha_i ; 9=slow-fast triplet ; 10=non-clock-like triplet }; @@ -175,6 +197,7 @@ struct posterior{ double noconflict_ILS, conflict_ILS; double noconflict_HGT, conflict_HGT; double asymmetry_ILS, asymmetry_HGT; + int dominant_ILS, dominant_HGT; //1: ((AC)B) > ((BC)A) ; 2: ((AC)B) < ((BC)A) }; @@ -287,20 +310,47 @@ void max_rank(double* tab, int n, double* max, int* max_r){ +/* discretize_exponential */ +/* returns an pointer towards an array of nb_bin positive values, which are the means of a discretization of the exponential distribution of mean mean in nb_bin equally weight bins */ +double* discretize_exponential(double mean, int nb_bin){ + int i; + double lambda; + double* boundaries; + double* bin_means; + + lambda=1/mean; + boundaries=(double*)check_alloc(nb_bin, sizeof(double)); + bin_means=(double*)check_alloc(nb_bin, sizeof(double)); + + //calculate boundaries + boundaries[0]=0; + for(i=1;i<nb_bin;i++) + boundaries[i]=-log(exp(-lambda*boundaries[i-1])-1./nb_bin)/lambda; + + //calculate bin means + for(i=0;i<nb_bin-1;i++) + bin_means[i]=nb_bin*(boundaries[i]*exp(-lambda*boundaries[i])-boundaries[i+1]*exp(-lambda*boundaries[i+1])+(exp(-lambda*boundaries[i])-exp(-lambda*boundaries[i+1]))/lambda); + bin_means[nb_bin-1]=nb_bin*(boundaries[nb_bin-1]*exp(-lambda*boundaries[nb_bin-1])+exp(-lambda*boundaries[i])/lambda); + + return bin_means; +} + + + /* has_long_branches */ -int has_long_branches(struct gene_data* gd){ +int has_long_branches(struct gene_data* gd, struct option *opt){ - if(gd->a*gd->lgseq>MAX_BL) return 1; - if(gd->b*gd->lgseq>MAX_BL) return 1; - if(gd->c*gd->lgseq>MAX_BL) return 1; - if(gd->d*gd->lgseq>MAX_BL) return 1; - if(gd->e*gd->lgseq>MAX_BL) return 1; + if(gd->a*gd->lgseq>opt->max_bl) return 1; + if(gd->b*gd->lgseq>opt->max_bl) return 1; + if(gd->c*gd->lgseq>opt->max_bl) return 1; + if(gd->d*gd->lgseq>opt->max_bl) return 1; + if(gd->e*gd->lgseq>opt->max_bl) return 1; return 0; } /* has_unequal_branches */ -int has_unequal_branches(struct gene_data* gd){ +int has_unequal_branches(struct gene_data* gd, struct option* opt){ int i, b1, b2, b3; int maxb; @@ -334,13 +384,11 @@ int has_unequal_branches(struct gene_data* gd){ pval=0; - for(i=maxb;i<MAX_BL;i++){ + for(i=maxb;i<opt->max_bl;i++){ pval+=exp(i*log(meanb)-meanb-log_facto[i]); } -//printf("%f %f %f %f, ig=%d, max=%d, mean=%f, pval:%f\n", gd->a, gd->b, gd->c, gd->d, gd->to_ignore, maxb, meanb, pval); -//getchar(); - if(pval<TRIPLET_CLOCK_THRESH) + if(pval<opt->triplet_clock_thresh) return 1; return 0; @@ -371,7 +419,7 @@ void copy_data(struct gene_data* from, struct gene_data* to){ /* find_dubious_genes */ -void find_dubious_genes(struct gene_data* gd, int nb_gene){ +void find_dubious_genes(struct gene_data* gd, int nb_gene, struct option* opt){ int i, j; double meanl, meantl, meanntl, alpha, ratio_i, ratio_mean; @@ -379,15 +427,15 @@ void find_dubious_genes(struct gene_data* gd, int nb_gene){ //genes with exceedingly long branches a, b, c or d for(i=0;i<nb_gene;i++){ - if(has_long_branches(gd+i)) - gd[i].to_ignore=5; + if(gd[i].to_ignore==0 && has_long_branches(gd+i, opt)) + gd[i].to_ignore=7; } //genes with anomalously long/short reference l meanl=0.; for(i=0;i<nb_gene;i++) meanl+=gd[i].l; meanl/=nb_gene; for(i=0;i<nb_gene;i++){ - if(gd[i].l/meanl>MAX_ALPHA || meanl/gd[i].l>MAX_ALPHA) - gd[i].to_ignore=6; + if(gd[i].to_ignore==0 && (gd[i].l/meanl>opt->max_alphai || meanl/gd[i].l>opt->max_alphai)) + gd[i].to_ignore=8; } //clock departure 1: slow/fast triplet @@ -396,16 +444,16 @@ void find_dubious_genes(struct gene_data* gd, int nb_gene){ ratio_mean=meantl/meanntl; for(i=0;i<nb_gene;i++){ ratio_i=gd[i].triplet_l/gd[i].non_triplet_l; - if(ratio_i/ratio_mean>MAX_CLOCK || ratio_mean/ratio_i>MAX_CLOCK){ - gd[i].to_ignore=7; + if(gd[i].to_ignore==0 && (ratio_i/ratio_mean>opt->max_clock_ratio || ratio_mean/ratio_i>opt->max_clock_ratio)){ + gd[i].to_ignore=9; } } //clock departure 2: non-clock triplet for(i=0;i<nb_gene;i++){ - if(has_unequal_branches(gd+i)) - gd[i].to_ignore=8; + if(gd[i].to_ignore==0 && has_unequal_branches(gd+i, opt)) + gd[i].to_ignore=10; } @@ -424,9 +472,14 @@ void print_ignored(struct gene_data* gd, int nb_gene){ nbign_tot=0; for(i=1;i<NB_FILTERING_CASES;i++) nbign_tot+=nbign[i]; - printf("ignoring %d genes\n", nbign_tot); - printf("[%d missing triplet; %d no outgroup; %d unexpected topology; %d long branches; %d anomalous ref; %d non clock-like]\n", nbign[1], nbign[2], nbign[3]+nbign[4], nbign[5], nbign[6], nbign[7]+nbign[8]); - printf("analyzing %d genes\n", nb_gene-nbign_tot); + printf("ignoring %d genes: ", nbign_tot); + printf("%d missing triplet; %d no outgroup; %d unexpected topology; %d unexpected branch lengths\n", nbign[1], nbign[2], nbign[3]+nbign[4]+nbign[5]+nbign[6], nbign[7]+nbign[8]+ nbign[9]+nbign[10]); + if(nbign[3]+nbign[4]+nbign[5]+nbign[6]) printf("[unexpected topology: %d triplet paraphyly; %d wrong root; %d triplet+outgroup paraphyly; %d outgroup paraphyly]\n", nbign[3], nbign[4], nbign[5], nbign[6]); + if(nbign[10]>0) + printf("[unexpected branch lengths: %d long branch; %d extreme alpha_i; %d slow/fast triplet; %d non-clock triplet]\n", nbign[7], nbign[8], nbign[9], nbign[10]); //user is expert and has activated triplet-clock filter (not mentioned in the doc) + else + if(nbign[7]+nbign[8]+nbign[9]) printf("[unexpected branch lengths: %d long branch; %d extreme alpha_i; %d slow/fast triplet]\n", nbign[7], nbign[8], nbign[9]); //normal case, no triplet-clock filter + printf("analyzing %d gene trees\n", nb_gene-nbign_tot); } @@ -463,15 +516,55 @@ int set_unresolved_topology(struct gene_data* gd, int nb_gene, double min){ } - /* print_options */ void print_options(struct option * opt){ int i; + int optimize_all; + printf("HGT times (relative to tau1):"); for(i=0;i<opt->nb_HGT_times;i++) printf(" %f", opt->HGT_time_coeff[i]); printf("\n"); - printf("posterior threshold: %.3f\n", opt->posterior_threshold); + if(opt->nb_ILS_times>1){ + printf("ILS coalescence times (relative to theta/2):"); + for(i=0;i<opt->nb_ILS_times;i++) printf(" %f", opt->discr_expo_mean1[i]); + printf("\n"); + } + printf("topology requirements:"); + if(opt->require_triplet_other_monophyly) printf(" triplet+other monophyly"); + if(opt->require_outgroup_monophyly){ + if(opt->require_triplet_other_monophyly) printf(";"); + printf(" outgroup monophyly"); + } + if(!opt->require_triplet_other_monophyly && !opt->require_outgroup_monophyly) printf(" none"); + printf("\n"); + printf("min_d: %f\n", opt->min_d); + + printf("filter thresholds: max_alphai=%.3f, max_bl=%.1f, max_clock_ratio=%.3f\n", opt->max_alphai, opt->max_bl, opt->max_clock_ratio); + + //printf("optimization: max_iterations=%d, max_backward_moves=%d, convergence=%f\n", opt->max_iterations, opt->max_backward_moves, opt->convergence); + + optimize_all=(opt->optimize_tau1 || opt->optimize_tau2 || opt->optimize_theta || opt->optimize_pab || opt->optimize_pac || opt->optimize_pbc || opt->optimize_pone); + if(!optimize_all){ + printf("do not optimize "); + if(!opt->optimize_tau1) printf("tau1 [%f] ", opt->fixed_tau1); + if(!opt->optimize_tau2) printf("tau2 [%f] ", opt->fixed_tau2); + if(!opt->optimize_theta) printf("theta [%f] ", opt->fixed_theta); + if(!opt->optimize_pab) printf("pab [%f] ", opt->fixed_pab); + if(!opt->optimize_pac) printf("pac [%f] ", opt->fixed_pac); + if(!opt->optimize_pbc) printf("pbc [%f] ", opt->fixed_pbc); + if(!opt->optimize_pone) printf("pone [%f] ", opt->fixed_pone); + } + + printf("Confidence intervals:" ); + if(opt->do_ILS_GF_CI) printf("ILS/GF "); + if(opt->do_param_CI) printf("parameters "); + if(!opt->do_ILS_GF_CI && !opt->do_param_CI) printf("none"); + else printf("[%f]", opt->CI_max_lnL_diff); + printf("\n"); + + + //printf("posterior_threshold: %.3f\n", opt->posterior_threshold); } @@ -481,6 +574,17 @@ void set_default_options(struct option * opt){ opt->verbose=1; opt->posterior_threshold=DEFAULT_POSTERIOR_THRESHOLD; + opt->nb_ILS_times=DEFAULT_nb_ILS_times; + opt->strict_no_event=DEFAULT_STRICT; + opt->require_triplet_other_monophyly=DEFAULT_REQUIRE_TO_MONOPH; + opt->require_outgroup_monophyly=DEFAULT_REQUIRE_OUTG_MONOPH; + opt->max_alphai=DEFAULT_MAX_ALPHAI; + opt->max_bl=DEFAULT_MAX_BL; + opt->max_clock_ratio=DEFAULT_MAX_CLOCK_RATIO; + opt->triplet_clock_thresh=DEFAULT_TRIPLET_CLOCK_THRESH; + opt->max_backward_moves=DEFAULT_MAX_BACKWARD_MOVES; + opt->max_iterations=DEFAULT_MAX_ITERATIONS; + opt->convergence=DEFAULT_CONVERGENCE; opt->nb_HGT_times=DEFAULT_nb_HGT_times; opt->HGT_time_coeff=(double*)check_alloc(MAX_nb_HGT_times, sizeof(double)); opt->HGT_time_coeff[0]=DEFAULT_HGT_TIME_1; @@ -489,11 +593,54 @@ void set_default_options(struct option * opt){ opt->optimize_pab=opt->optimize_pac=opt->optimize_pbc=opt->optimize_pone=1; opt->fixed_tau1=opt->fixed_tau2=opt->fixed_theta=-1.; opt->fixed_pab=opt->fixed_pac=opt->fixed_pbc=opt->fixed_pone=-1.; - opt->min_d=-1.; - opt->do_ILS_GF_CI=1; - opt->do_param_CI=0; - opt->CI_max_lnL_diff=2.; + opt->min_d=DEFAULT_MIN_D; + opt->do_ILS_GF_CI=DEFAULT_DO_CI1; + opt->do_param_CI=DEFAULT_DO_CI2; + opt->CI_max_lnL_diff=DEFAULT_CI_THR; + +} + + + + +/* set_scenario_names */ +void set_scenario_names(struct option* opt){ + int i; + + scenario_name=(char**)check_alloc(nb_scenario_plus1, sizeof(char*)); + for(i=1;i<=nb_scenario_plus1;i++) scenario_name[i]=(char*)check_alloc(101, sizeof(char)); + sprintf(scenario_name[1], "no_event"); + if(opt->nb_ILS_times>1){ + for(i=0;i<opt->nb_ILS_times;i++){ + sprintf(scenario_name[2+i*3],"ILS_%.3f_((AB)C)", opt->discr_expo_mean1[i]); + sprintf(scenario_name[3+i*3],"ILS_%.3f_((AC)B)", opt->discr_expo_mean1[i]); + sprintf(scenario_name[4+i*3],"ILS_%.3f_((BC)A)", opt->discr_expo_mean1[i]); + } + } + else{ + sprintf(scenario_name[2],"ILS_((AB)C)"); + sprintf(scenario_name[3],"ILS_((AC)B)"); + sprintf(scenario_name[4],"ILS_((BC)A)"); + } + + if(opt->nb_HGT_times>1){ + for(i=0;i<opt->nb_HGT_times;i++){ + sprintf(scenario_name[2+3*opt->nb_ILS_times+i*5],"GF_%.3f_A<->B", opt->HGT_time_coeff[i]); + sprintf(scenario_name[3+3*opt->nb_ILS_times+i*5],"GF_%.3f_A->C", opt->HGT_time_coeff[i]); + sprintf(scenario_name[4+3*opt->nb_ILS_times+i*5],"GF_%.3f_C->A", opt->HGT_time_coeff[i]); + sprintf(scenario_name[5+3*opt->nb_ILS_times+i*5],"GF_%.3f_B->C", opt->HGT_time_coeff[i]); + sprintf(scenario_name[6+3*opt->nb_ILS_times+i*5],"GF_%.3f_C->B", opt->HGT_time_coeff[i]); + } + } + else{ + sprintf(scenario_name[2+3*opt->nb_ILS_times],"GF_A<->B"); + sprintf(scenario_name[3+3*opt->nb_ILS_times],"GF_A->C"); + sprintf(scenario_name[4+3*opt->nb_ILS_times],"GF_C->A"); + sprintf(scenario_name[5+3*opt->nb_ILS_times],"GF_B->C"); + sprintf(scenario_name[6+3*opt->nb_ILS_times],"GF_C->B"); + } + } @@ -551,19 +698,31 @@ void read_option_file(FILE* optf, struct option * opt){ while(fgets(line, MAXLLINE, optf)){ if(strncmp(line, "verbose", 7)==0) opt->verbose=read_1_param(line, "verbose"); if(strncmp(line, "posterior_threshold", 13)==0) opt->posterior_threshold=read_1_param(line, "posterior_threshold"); - if(strncmp(line, "nb_HGT_times", 11)==0) opt->nb_HGT_times=read_1_param(line, "nb_HGT_times"); - if(strncmp(line, "HGT_time_coeff", 11)==0) opt->HGT_time_coeff=read_n_param(line, "HGT_time_coeff", opt->nb_HGT_times); - if(strncmp(line, "tau1", 4)==0) {opt->optimize_tau1=0; opt->fixed_tau1=read_1_param(line, "tau1");} - if(strncmp(line, "tau2", 4)==0) {opt->optimize_tau2=0; opt->fixed_tau2=read_1_param(line, "tau2");} - if(strncmp(line, "theta", 5)==0) {opt->optimize_theta=0; opt->fixed_theta=read_1_param(line, "theta");} - if(strncmp(line, "pab", 3)==0) {opt->optimize_pab=0; opt->fixed_pab=read_1_param(line, "pab");} - if(strncmp(line, "pac", 3)==0) {opt->optimize_pac=0; opt->fixed_pac=read_1_param(line, "pac");} - if(strncmp(line, "pbc", 3)==0) {opt->optimize_pbc=0; opt->fixed_pbc=read_1_param(line, "pbc");} - if(strncmp(line, "pold", 4)==0) {opt->optimize_pone=0; opt->fixed_pone=read_1_param(line, "pold");} + if(strncmp(line, "nb_ILS_times", 11)==0) opt->nb_ILS_times=read_1_param(line, "nb_ILS_times"); + if(strncmp(line, "nb_GF_times", 11)==0) opt->nb_HGT_times=read_1_param(line, "nb_GF_times"); + if(strncmp(line, "GF_time_coeff", 11)==0) opt->HGT_time_coeff=read_n_param(line, "GF_time_coeff", opt->nb_HGT_times); + if(strncmp(line, "strict_no_event", 15)==0) opt->strict_no_event=read_1_param(line, "strict_no_event"); + if(strncmp(line, "require_triplet_other_monophyly", 30)==0) opt->require_triplet_other_monophyly=read_1_param(line, "require_triplet_other_monophyly"); + if(strncmp(line, "require_outgroup_monophyly", 25)==0) opt->require_outgroup_monophyly=read_1_param(line, "require_outgroup_monophyly"); + if(strncmp(line, "max_alphai", 9)==0) opt->max_alphai=read_1_param(line, "max_alphai"); + if(strncmp(line, "max_bl", 6)==0) opt->max_bl=read_1_param(line, "max_bl"); + if(strncmp(line, "max_clock_ratio", 15)==0) opt->max_clock_ratio=read_1_param(line, "max_clock_ratio"); + if(strncmp(line, "triplet_clock_thresh", 20)==0) opt->triplet_clock_thresh=read_1_param(line, "triplet_clock_thresh"); + if(strncmp(line, "max_backward_moves", 15)==0) opt->max_backward_moves=read_1_param(line, "max_backward_moves"); + if(strncmp(line, "max_iterations", 12)==0) opt->max_iterations=read_1_param(line, "max_iterations"); + if(strncmp(line, "convergence", 10)==0) opt->convergence=read_1_param(line, "convergence"); + if(strncmp(line, "fixed_tau1", 10)==0) {opt->fixed_tau1=read_1_param(line, "fixed_tau1"); if(opt->fixed_tau1>0.) opt->optimize_tau1=0;} + if(strncmp(line, "fixed_tau2", 10)==0) {opt->fixed_tau2=read_1_param(line, "fixed_tau2"); if(opt->fixed_tau2>0.) opt->optimize_tau2=0;} + if(strncmp(line, "fixed_theta", 11)==0) {opt->fixed_theta=read_1_param(line, "fixed_theta"); if(opt->fixed_theta>0.) opt->optimize_theta=0;} + if(strncmp(line, "fixed_pab", 9)==0) {opt->fixed_pab=read_1_param(line, "fixed_pab"); if(opt->fixed_pab>0.) opt->optimize_pab=0;} + if(strncmp(line, "fixed_pac", 9)==0) {opt->fixed_pac=read_1_param(line, "fixed_pac"); if(opt->fixed_pac>0.) opt->optimize_pac=0;} + if(strncmp(line, "fixed_pbc", 9)==0) {opt->fixed_pbc=read_1_param(line, "fixed_pbc"); if(opt->fixed_pbc>0.) opt->optimize_pbc=0;} + if(strncmp(line, "fixed_panc", 10)==0) {opt->fixed_pone=read_1_param(line, "fixed_panc"); if(opt->fixed_pone>0.) opt->optimize_pone=0;} if(strncmp(line, "min_d", 5)==0) opt->min_d=read_1_param(line, "min_d"); if(strncmp(line, "CI_ILS_GF", 9)==0) opt->do_ILS_GF_CI=read_1_param(line, "CI_ILS_GF"); if(strncmp(line, "CI_param", 8)==0) opt->do_param_CI=read_1_param(line, "CI_param"); } + } @@ -1501,10 +1660,20 @@ int is_in_tree(noeud nd, char* tax){ /* is_monophyletic_rooted */ int is_monophyletic_rooted(int** ttree, int nb, int nbbi, char** names, char** taxa, int nbtax){ - int i, j, n, in; + int i, j, k, n, in; if(nbtax==1) return 1; +/* +for(i=0;i<nbtax;i++) printf("%s\n", taxa[i]); +for(j=0;j<=nb;j++){ + for(i=0;i<nbbi;i++){ + printf("%d", ttree[j][i]); + } + printf(" %s\n", names[j]); +} +*/ + for(i=0;i<nbbi;i++){ n=0; for(j=1;j<nb+1;j++) n+=ttree[j][i]; @@ -1514,6 +1683,10 @@ int is_monophyletic_rooted(int** ttree, int nb, int nbbi, char** names, char** t if(in==1 && ttree[j][i]==0) break; if(in==0 && ttree[j][i]==1) break; } +/* +printf("branch %d:%d\n", i, j); +getchar(); +*/ if(j==nb+1) return 1; } @@ -1522,6 +1695,41 @@ int is_monophyletic_rooted(int** ttree, int nb, int nbbi, char** names, char** t +/* is_root_mrca_rooted */ +/* return 1 if the common ancestor to the two groups of taxa taxa1 and taxa2 is the tre root, 0 if not */ +/* inefficient version in o(n2) but called once per run so that's ok*/ +int is_root_mrca_rooted(int** ttree, int nb, int nbbi, char** names, char** taxa1, int nbtax1, char** taxa2, int nbtax2){ + + int i, j, k, n, n1; + + n=nbtax1+nbtax2; + + for(i=0;i<nbbi;i++){ + n1=0; + for(j=0;j<nbtax1;j++){ + for(k=1;k<=nb;k++){ + if(strcmp(names[k],taxa1[j])==0){ + n1+=ttree[k][i]; + break; + } + } + } + for(j=0;j<nbtax2;j++){ + for(k=1;k<=nb;k++){ + if(strcmp(names[k],taxa2[j])==0){ + n1+=ttree[k][i]; + break; + } + } + } + if(n1==n) return 0; + } + + return 1; +} + + + /* get_triplet_topol */ int get_triplet_topol(int** ttree, int nb, int nbbi, char** names, char** triplet){ @@ -1591,13 +1799,13 @@ void get_triplet_bl(int** ttree, int nb, int nbbi, char** names, double* lgbi, d /* get_triplet_topol_branch_lengths */ -void get_triplet_topol_branch_lengths(char* ctree, struct relevant_taxa* tax, struct gene_data* gd){ +void get_triplet_topol_branch_lengths(char* ctree, struct relevant_taxa* tax, struct option* opt, struct gene_data* gd){ int i, nb, nbtax, is_rooted, nbsp, nbbi; int noutg, noth; int **ttree; double* lgbi, *lgbp, *bootvals; - char** all_tax, **triplet_other, **names, rac; + char** all_tax, **triplet_other, **outgroup, **names, rac; noeud* stree; gd->to_ignore=0; @@ -1630,7 +1838,7 @@ void get_triplet_topol_branch_lengths(char* ctree, struct relevant_taxa* tax, st for(i=0;i<tax->nb_oth;i++) all_tax[3+tax->nb_outg+i]=tax->oth[i]; triplet_other=(char**)check_alloc(3+tax->nb_oth, sizeof(char*)); for(i=0;i<3;i++) triplet_other[i]=tax->triplet[i]; - for(i=0;i<tax->nb_oth;i++) triplet_other[3+i]=tax->oth[i]; + outgroup=(char**)check_alloc(tax->nb_outg, sizeof(char*)); //restrict tree /* restrict */ @@ -1644,10 +1852,25 @@ void get_triplet_topol_branch_lengths(char* ctree, struct relevant_taxa* tax, st } } - //check >=1 outgroup species + //count+identify other + noth=0; + for(i=0;i<tax->nb_oth;i++){ + if(is_in_tree(root, tax->oth[i])){ + triplet_other[3+noth]=tax->oth[i]; + noth++; + } + } + + //count+identify outgroup noutg=0; - for(i=0;i<tax->nb_outg;i++) - if(is_in_tree(root, tax->outg[i])) noutg++; + for(i=0;i<tax->nb_outg;i++){ + if(is_in_tree(root, tax->outg[i])){ + outgroup[noutg]=tax->outg[i]; + noutg++; + } + } + + //check >=1 outgroup species if(noutg==0){ gd->to_ignore=2; return; @@ -1662,16 +1885,28 @@ void get_triplet_topol_branch_lengths(char* ctree, struct relevant_taxa* tax, st return; } - //check triplet+other monophyly - if(is_monophyletic_rooted(ttree, nb, nbbi, names, triplet_other, 3+tax->nb_oth)==0){ + //check MRCA(triplet, outg)==root + if(is_root_mrca_rooted(ttree, nb, nbbi, names, tax->triplet, 3, outgroup, noutg)==0){ gd->to_ignore=4; + return; + } + + //check triplet+other monophyly (optional) + if(opt->require_triplet_other_monophyly && is_monophyletic_rooted(ttree, nb, nbbi, names, triplet_other, 3+noth)==0){ + gd->to_ignore=5; return; - } - - //get topology + } + + //check outgroup monophyly (optional) + if(opt->require_outgroup_monophyly && is_monophyletic_rooted(ttree, nb, nbbi, names, outgroup, noutg)==0){ + gd->to_ignore=6; + return; + } + + //get triplet topology gd->topology=get_triplet_topol(ttree, nb, nbbi, names, tax->triplet); - //get branch lengths + //get triplet branch lengths get_triplet_bl(ttree, nb, nbbi, names, lgbi, lgbp, tax->triplet, gd->topology, gd); } @@ -1731,7 +1966,7 @@ void calculate_root_to_tip(noeud nd){ /* read_data_file */ -struct gene_data* read_data_file(FILE* inf, struct relevant_taxa* tax, int* nbg){ +struct gene_data* read_data_file(FILE* inf, struct option* opt, struct relevant_taxa* tax, int* nbg){ struct gene_data* gdata; int i, nb_gene, iret, lline, maxlline, lgt; @@ -1780,19 +2015,19 @@ struct gene_data* read_data_file(FILE* inf, struct relevant_taxa* tax, int* nbg) if(ret==NULL) break; if(line[0]=='\n' || line[0]==' ' || line[0]=='\t' || line[0]=='#') continue; - tree=strtok(line, " \t\n"); + tree=strtok(line, "\t\n"); - lgs=strtok(NULL, " \t\n"); - if(lgs==NULL) fin("unexpected line in data file"); + lgs=strtok(NULL, "\t\n"); + if(lgs==NULL) fin("unexpected line in data file (1)"); iret=sscanf(lgs, "%d", &(gdata[i].lgseq)); - if(iret!=1) fin("unexpected line in data file"); + if(iret!=1) fin("unexpected line in data file (2)"); - name=strtok(NULL, " \t\n"); - if(name==NULL) fin("unexpected line in data file"); + name=strtok(NULL, "\t\n"); + if(name==NULL) fin("unexpected line in data file (3)"); gdata[i].name=(char*)check_alloc(strlen(name)+1, sizeof(char)); sprintf(gdata[i].name, "%s", name); - get_triplet_topol_branch_lengths(tree, tax, gdata+i); + get_triplet_topol_branch_lengths(tree, tax, opt, gdata+i); calculate_root_to_tip(root); all_rtt=root->av_l_to_tip; @@ -1862,17 +2097,19 @@ void set_topol_scenario(struct option opt){ topol_scenario[0][1]=1; //ILS - topol_scenario[0][2]=1; - topol_scenario[1][3]=1; - topol_scenario[2][4]=1; + for(i=0;i<opt.nb_ILS_times;i++){ + topol_scenario[0][2+3*i]=1; + topol_scenario[1][3+3*i]=1; + topol_scenario[2][4+3*i]=1; + } //HGT for(i=0;i<opt.nb_HGT_times;i++){ - topol_scenario[0][5+5*i]=1; - topol_scenario[1][6+5*i]=1; - topol_scenario[1][7+5*i]=1; - topol_scenario[2][8+5*i]=1; - topol_scenario[2][9+5*i]=1; + topol_scenario[0][3*opt.nb_ILS_times+2+5*i]=1; + topol_scenario[1][3*opt.nb_ILS_times+3+5*i]=1; + topol_scenario[1][3*opt.nb_ILS_times+4+5*i]=1; + topol_scenario[2][3*opt.nb_ILS_times+5+5*i]=1; + topol_scenario[2][3*opt.nb_ILS_times+6+5*i]=1; } //unresolved topology @@ -1995,7 +2232,7 @@ void set_starting(struct gene_data* gdata, int nbg, struct option opt, double th //par->pac=prop_topol[1]/2.; if(opt.optimize_pac==0) par->pac=opt.fixed_pac; //par->pbc=prop_topol[2]/2.; - if(opt.optimize_pbc==0) par->pac=opt.fixed_pbc; + if(opt.optimize_pbc==0) par->pbc=opt.fixed_pbc; //starting pab: mean of pac and pbc par->pab=(par->pac+par->pbc)/2.; @@ -2101,14 +2338,16 @@ void calculate_gene_likelihood(struct gene_data* gd, struct param* par, struct o //scenario probabilities p[1]=(1-pab-pac-pbc)*(1.-pils); - p[2]=p[3]=p[4]=(1-pab-pac-pbc)*pils/3; - p[5]=pab*pone; p[6]=p[7]=pac*pone/2.; p[8]=p[9]=pbc*pone/2.; + for(i=2;i<3*opt->nb_ILS_times+2;i++) + p[i]=(1-pab-pac-pbc)*pils/(3*opt->nb_ILS_times); + + p[3*opt->nb_ILS_times+2]=pab*pone; p[3*opt->nb_ILS_times+3]=p[3*opt->nb_ILS_times+4]=pac*pone/2.; p[3*opt->nb_ILS_times+5]=p[3*opt->nb_ILS_times+6]=pbc*pone/2.; for(i=1;i<opt->nb_HGT_times;i++){ - p[5+5*i]=pab*(1-pone)/(opt->nb_HGT_times-1); - p[6+5*i]=p[7+5*i]=pac*(1-pone)/(2.*(opt->nb_HGT_times-1)); - p[8+5*i]=p[9+5*i]=pbc*(1-pone)/(2.*(opt->nb_HGT_times-1)); + p[3*opt->nb_ILS_times+2+5*i]=pab*(1-pone)/(opt->nb_HGT_times-1); + p[3*opt->nb_ILS_times+3+5*i]=p[3*opt->nb_ILS_times+4+5*i]=pac*(1-pone)/(2.*(opt->nb_HGT_times-1)); + p[3*opt->nb_ILS_times+5+5*i]=p[3*opt->nb_ILS_times+6+5*i]=pbc*(1-pone)/(2.*(opt->nb_HGT_times-1)); } //scenario likelihoods and scenario likelihood derivatives wrt t1, t2, th @@ -2137,15 +2376,22 @@ void calculate_gene_likelihood(struct gene_data* gd, struct param* par, struct o } //gene likelihood derivatives wrt pab pac pbc pone - glh->dLdpab=-pr[1]*(1.-pils)-(pr[2]+pr[3]+pr[4])*pils/3+pr[5]*pone; - for(i=1;i<opt->nb_HGT_times;i++) glh->dLdpab+=pr[5+5*i]*(1.-pone)/(opt->nb_HGT_times-1); - glh->dLdpac=-pr[1]*(1.-pils)-(pr[2]+pr[3]+pr[4])*pils/3+(pr[6]+pr[7])*pone/2.; - for(i=1;i<opt->nb_HGT_times;i++) glh->dLdpac+=(pr[6+5*i]+pr[7+5*i])*(1.-pone)/(2*(opt->nb_HGT_times-1)); - glh->dLdpbc=-pr[1]*(1.-pils)-(pr[2]+pr[3]+pr[4])*pils/3+(pr[8]+pr[9])*pone/2.; - for(i=1;i<opt->nb_HGT_times;i++) glh->dLdpbc+=(pr[8+5*i]+pr[9+5*i])*(1.-pone)/(2*(opt->nb_HGT_times-1)); - - glh->dLdpone=pr[5]*pab+(pr[6]+pr[7])*pac/2.+(pr[8]+pr[9])*pbc/2.; - for(i=1;i<opt->nb_HGT_times;i++) glh->dLdpone-=((pr[5+5*i]*pab)+(pr[6+5*i]+pr[7+5*i])*pac/2.+(pr[8+5*i]+pr[9+5*i])*pbc/2.)/(opt->nb_HGT_times-1); + glh->dLdpab=-pr[1]*(1.-pils); + for(i=0;i<3*opt->nb_ILS_times;i++) glh->dLdpab-=(pr[2+i]*pils/(3*opt->nb_ILS_times)); + glh->dLdpab+=pr[2+3*opt->nb_ILS_times]*pone; + for(i=1;i<opt->nb_HGT_times;i++) glh->dLdpab+=pr[2+3*opt->nb_ILS_times+5*i]*(1.-pone)/(opt->nb_HGT_times-1); + glh->dLdpac=-pr[1]*(1.-pils); + for(i=0;i<3*opt->nb_ILS_times;i++) glh->dLdpac-=(pr[2+i]*pils/(3*opt->nb_ILS_times)); + glh->dLdpac+=(pr[3+3*opt->nb_ILS_times]+pr[4+3*opt->nb_ILS_times])*pone/2.; + for(i=1;i<opt->nb_HGT_times;i++) glh->dLdpac+=(pr[3+3*opt->nb_ILS_times+5*i]+pr[4+3*opt->nb_ILS_times+5*i])*(1.-pone)/(2*(opt->nb_HGT_times-1)); + glh->dLdpbc=-pr[1]*(1.-pils); + for(i=0;i<3*opt->nb_ILS_times;i++) glh->dLdpbc-=(pr[2+i]*pils/(3*opt->nb_ILS_times)); + glh->dLdpbc+=(pr[5+3*opt->nb_ILS_times]+pr[6+3*opt->nb_ILS_times])*pone/2.; + for(i=1;i<opt->nb_HGT_times;i++) glh->dLdpbc+=(pr[5+3*opt->nb_ILS_times+5*i]+pr[6+3*opt->nb_ILS_times+5*i])*(1.-pone)/(2*(opt->nb_HGT_times-1)); + + glh->dLdpone=pr[2+3*opt->nb_ILS_times]*pab+(pr[3+3*opt->nb_ILS_times]+pr[4+3*opt->nb_ILS_times])*pac/2.+(pr[5+3*opt->nb_ILS_times]+pr[6+3*opt->nb_ILS_times])*pbc/2.; + for(i=1;i<opt->nb_HGT_times;i++) glh->dLdpone-=((pr[2+3*opt->nb_ILS_times+5*i]*pab)+(pr[3+3*opt->nb_ILS_times+5*i]+pr[4+3*opt->nb_ILS_times+5*i])*pac/2.+(pr[5+3*opt->nb_ILS_times+5*i]+pr[6+3*opt->nb_ILS_times+5*i])*pbc/2.)/(opt->nb_HGT_times-1); + glh->d2Ldpab=glh->d2Ldpac=glh->d2Ldpbc=glh->d2Ldpone=0; //gene likelihood derivatives wrt t1, t2, th (harder) @@ -2163,18 +2409,22 @@ void calculate_gene_likelihood(struct gene_data* gd, struct param* par, struct o dpdt1[1]=-(1-pab-pac-pbc)*dpilsdt1; dpdt2[1]=-(1-pab-pac-pbc)*dpilsdt2; dpdth[1]=-(1-pab-pac-pbc)*dpilsdth; - dpdt1[2]=dpdt1[3]=dpdt1[4]=(1-pab-pac-pbc)*dpilsdt1/3; - dpdt2[2]=dpdt2[3]=dpdt2[4]=(1-pab-pac-pbc)*dpilsdt2/3; - dpdth[2]=dpdth[3]=dpdth[4]=(1-pab-pac-pbc)*dpilsdth/3; - for(i=5;i<nb_scenario_plus1;i++) dpdt1[i]=dpdt2[i]=dpdth[i]=0.; + for(i=0;i<opt->nb_ILS_times;i++){ + dpdt1[2+3*i]=dpdt1[3+3*i]=dpdt1[4+3*i]=(1-pab-pac-pbc)*dpilsdt1/(3*opt->nb_ILS_times); + dpdt2[2+3*i]=dpdt2[3+3*i]=dpdt2[4+3*i]=(1-pab-pac-pbc)*dpilsdt2/(3*opt->nb_ILS_times); + dpdth[2+3*i]=dpdth[3+3*i]=dpdth[4+3*i]=(1-pab-pac-pbc)*dpilsdth/(3*opt->nb_ILS_times); + } + for(i=3*opt->nb_ILS_times+2;i<nb_scenario_plus1;i++) dpdt1[i]=dpdt2[i]=dpdth[i]=0.; d2pdt1[1]=-(1-pab-pac-pbc)*d2pilsdt1; d2pdt2[1]=-(1-pab-pac-pbc)*d2pilsdt2; d2pdth[1]=-(1-pab-pac-pbc)*d2pilsdth; - d2pdt1[2]=d2pdt1[3]=d2pdt1[4]=(1-pab-pac-pbc)*d2pilsdt1/3; - d2pdt2[2]=d2pdt2[3]=d2pdt2[4]=(1-pab-pac-pbc)*d2pilsdt2/3; - d2pdth[2]=d2pdth[3]=d2pdth[4]=(1-pab-pac-pbc)*d2pilsdth/3; - for(i=5;i<nb_scenario_plus1;i++) d2pdt1[i]=d2pdt2[i]=d2pdth[i]=0.; + for(i=0;i<opt->nb_ILS_times;i++){ + d2pdt1[2+3*i]=d2pdt1[3+3*i]=d2pdt1[4+3*i]=(1-pab-pac-pbc)*d2pilsdt1/(3*opt->nb_ILS_times); + d2pdt2[2+3*i]=d2pdt2[3+3*i]=d2pdt2[4+3*i]=(1-pab-pac-pbc)*d2pilsdt2/(3*opt->nb_ILS_times); + d2pdth[2+3*i]=d2pdth[3+3*i]=d2pdth[4+3*i]=(1-pab-pac-pbc)*d2pilsdth/(3*opt->nb_ILS_times); + } + for(i=3*opt->nb_ILS_times+2;i<nb_scenario_plus1;i++) d2pdt1[i]=d2pdt2[i]=d2pdth[i]=0.; //t1 glh->dLdt1=glh->d2Ldt1=0.; @@ -2224,37 +2474,53 @@ void set_expected_length(struct param* par, struct option* opt){ exp_lg_coeff[1][2][0]=0; exp_lg_coeff[1][2][1]=1; exp_lg_coeff[1][2][2]=0.5; exp_lg[1][3]=t2-t1; //d exp_lg_coeff[1][3][0]=-1; exp_lg_coeff[1][3][1]=1; exp_lg_coeff[1][3][2]=0; - - //scenario 2: ILS topology c - exp_lg[2][0]=t2+ths6; //a - exp_lg_coeff[2][0][0]=0; exp_lg_coeff[2][0][1]=1; exp_lg_coeff[2][0][2]=1./6.; - exp_lg[2][1]=t2+ths6; //b - exp_lg_coeff[2][1][0]=0; exp_lg_coeff[2][1][1]=1; exp_lg_coeff[2][1][2]=1./6.; - exp_lg[2][2]=t2+ths6+ths2; //c - exp_lg_coeff[2][2][0]=0; exp_lg_coeff[2][2][1]=1; exp_lg_coeff[2][2][2]=2./3.; - exp_lg[2][3]=ths2; //d - exp_lg_coeff[2][3][0]=0; exp_lg_coeff[2][3][1]=0; exp_lg_coeff[2][3][2]=0.5; - - //scenario 3: ILS topology b - exp_lg[3][0]=t2+ths6; //a - exp_lg_coeff[3][0][0]=0; exp_lg_coeff[3][0][1]=1; exp_lg_coeff[3][0][2]=1./6.; - exp_lg[3][1]=t2+ths6+ths2; //b - exp_lg_coeff[3][1][0]=0; exp_lg_coeff[3][1][1]=1; exp_lg_coeff[3][1][2]=2./3.; - exp_lg[3][2]=t2+ths6; //c - exp_lg_coeff[3][2][0]=0; exp_lg_coeff[3][2][1]=1; exp_lg_coeff[3][2][2]=1./6.; - exp_lg[3][3]=ths2; //d - exp_lg_coeff[3][3][0]=0; exp_lg_coeff[3][3][1]=0; exp_lg_coeff[3][3][2]=0.5; - - //scenario 4: ILS topology a - exp_lg[4][0]=t2+ths6+ths2; //a - exp_lg_coeff[4][0][0]=0; exp_lg_coeff[4][0][1]=1; exp_lg_coeff[4][0][2]=2./3.; - exp_lg[4][1]=t2+ths6; //b - exp_lg_coeff[4][1][0]=0; exp_lg_coeff[4][1][1]=1; exp_lg_coeff[4][1][2]=1./6.; - exp_lg[4][2]=t2+ths6; //c - exp_lg_coeff[4][2][0]=0; exp_lg_coeff[4][2][1]=1; exp_lg_coeff[4][2][2]=1./6.; - exp_lg[4][3]=ths2; //d - exp_lg_coeff[4][3][0]=0; exp_lg_coeff[4][3][1]=0; exp_lg_coeff[4][3][2]=0.5; - + + if(opt->strict_no_event && t1+ths2>t2){ + exp_lg[1][0]=t2; //a + exp_lg_coeff[1][0][0]=0; exp_lg_coeff[1][0][1]=1; exp_lg_coeff[1][0][2]=0; + exp_lg[1][1]=t2; //b + exp_lg_coeff[1][1][0]=0; exp_lg_coeff[1][1][1]=1; exp_lg_coeff[1][1][2]=0; + exp_lg[1][2]=t2+ths2; //c + exp_lg_coeff[1][2][0]=0; exp_lg_coeff[1][2][1]=1; exp_lg_coeff[1][2][2]=0.5; + exp_lg[1][3]=ths2; //d + exp_lg_coeff[1][3][0]=0; exp_lg_coeff[1][3][1]=0; exp_lg_coeff[1][3][2]=0.5; + } + + + for(i=0;i<opt->nb_ILS_times;i++){ + //scenario 2 modulo 3: ILS topology c + k=2+3*i; + exp_lg[k][0]=t2+ths6; //a + exp_lg_coeff[k][0][0]=0; exp_lg_coeff[k][0][1]=1; exp_lg_coeff[k][0][2]=1./6.; + exp_lg[k][1]=t2+ths6; //b + exp_lg_coeff[k][1][0]=0; exp_lg_coeff[k][1][1]=1; exp_lg_coeff[k][1][2]=1./6.; + exp_lg[k][2]=t2+ths6+ths2*opt->discr_expo_mean1[i]; //c + exp_lg_coeff[k][2][0]=0; exp_lg_coeff[k][2][1]=1; exp_lg_coeff[k][2][2]=1./6.+0.5*opt->discr_expo_mean1[i]; + exp_lg[k][3]=ths2*opt->discr_expo_mean1[i]; //d + exp_lg_coeff[k][3][0]=0; exp_lg_coeff[k][3][1]=0; exp_lg_coeff[k][3][2]=0.5*opt->discr_expo_mean1[i]; + + //scenario 3 modulo 3: ILS topology b + k=3+3*i; + exp_lg[k][0]=t2+ths6; //a + exp_lg_coeff[k][0][0]=0; exp_lg_coeff[k][0][1]=1; exp_lg_coeff[k][0][2]=1./6.; + exp_lg[k][1]=t2+ths6+ths2*opt->discr_expo_mean1[i]; //b + exp_lg_coeff[k][1][0]=0; exp_lg_coeff[k][1][1]=1; exp_lg_coeff[k][1][2]=1./6.+0.5*opt->discr_expo_mean1[i]; + exp_lg[k][2]=t2+ths6; //c + exp_lg_coeff[k][2][0]=0; exp_lg_coeff[k][2][1]=1; exp_lg_coeff[k][2][2]=1./6.; + exp_lg[k][3]=ths2*opt->discr_expo_mean1[i]; //d + exp_lg_coeff[k][3][0]=0; exp_lg_coeff[k][3][1]=0; exp_lg_coeff[k][3][2]=0.5*opt->discr_expo_mean1[i]; + + //scenario 4 modulo 3: ILS topology a + k=4+3*i; + exp_lg[k][0]=t2+ths6+ths2*opt->discr_expo_mean1[i]; //a + exp_lg_coeff[k][0][0]=0; exp_lg_coeff[k][0][1]=1; exp_lg_coeff[k][0][2]=1./6.+0.5*opt->discr_expo_mean1[i]; + exp_lg[k][1]=t2+ths6; //b + exp_lg_coeff[k][1][0]=0; exp_lg_coeff[k][1][1]=1; exp_lg_coeff[k][1][2]=1./6.; + exp_lg[k][2]=t2+ths6; //c + exp_lg_coeff[k][2][0]=0; exp_lg_coeff[k][2][1]=1; exp_lg_coeff[k][2][2]=1./6.; + exp_lg[k][3]=ths2*opt->discr_expo_mean1[i]; //d + exp_lg_coeff[k][3][0]=0; exp_lg_coeff[k][3][1]=0; exp_lg_coeff[k][3][2]=0.5*opt->discr_expo_mean1[i]; + } for(i=0;i<opt->nb_HGT_times;i++){ @@ -2263,7 +2529,7 @@ void set_expected_length(struct param* par, struct option* opt){ tg=par->tau1*coeff; //scenario 5 modulo 5: HGT A<->B - k=5+5*i; + k=2+3*opt->nb_ILS_times+5*i; exp_lg[k][0]=tg; //a exp_lg_coeff[k][0][0]=coeff; exp_lg_coeff[k][0][1]=0; exp_lg_coeff[k][0][2]=0; exp_lg[k][1]=tg; //b @@ -2274,7 +2540,7 @@ void set_expected_length(struct param* par, struct option* opt){ exp_lg_coeff[k][3][0]=-coeff; exp_lg_coeff[k][3][1]=1; exp_lg_coeff[k][3][2]=0.5; //scenario 6 modulo 5: HGT A->C - k=6+5*i; + k=3+3*opt->nb_ILS_times+5*i; exp_lg[k][0]=tg; //a exp_lg_coeff[k][0][0]=coeff; exp_lg_coeff[k][0][1]=0; exp_lg_coeff[k][0][2]=0; exp_lg[k][1]=t1+ths2; //b @@ -2285,7 +2551,7 @@ void set_expected_length(struct param* par, struct option* opt){ exp_lg_coeff[k][3][0]=1-coeff; exp_lg_coeff[k][3][1]=0; exp_lg_coeff[k][3][2]=0.5; //scenario 7 modulo 5: HGT C->A - k=7+5*i; + k=4+3*opt->nb_ILS_times+5*i; exp_lg[k][0]=tg; //a exp_lg_coeff[k][0][0]=coeff; exp_lg_coeff[k][0][1]=0; exp_lg_coeff[k][0][2]=0; exp_lg[k][1]=t2+ths2; //b @@ -2296,7 +2562,7 @@ void set_expected_length(struct param* par, struct option* opt){ exp_lg_coeff[k][3][0]=-coeff; exp_lg_coeff[k][3][1]=1; exp_lg_coeff[k][3][2]=0.5; //scenario 8 modulo 5: HGT B->C - k=8+5*i; + k=5+3*opt->nb_ILS_times+5*i; exp_lg[k][0]=t1+ths2; //a exp_lg_coeff[k][0][0]=1; exp_lg_coeff[k][0][1]=0; exp_lg_coeff[k][0][2]=0.5; exp_lg[k][1]=tg; //b @@ -2307,7 +2573,7 @@ void set_expected_length(struct param* par, struct option* opt){ exp_lg_coeff[k][3][0]=1-coeff; exp_lg_coeff[k][3][1]=0; exp_lg_coeff[k][3][2]=0.5; //scenario 9 modulo 5: HGT C->B - k=9+5*i; + k=6+3*opt->nb_ILS_times+5*i; exp_lg[k][0]=t2+ths2; //a exp_lg_coeff[k][0][0]=0; exp_lg_coeff[k][0][1]=1; exp_lg_coeff[k][0][2]=0.5; exp_lg[k][1]=tg; //b @@ -2431,17 +2697,16 @@ struct gene_likelihood* optimize_lnL_starting(struct gene_data* gdata, int nbg, // calculate starting L + derivatives lh=calculate_log_likelihood(glh, nbg); - if(opt.verbose) printf("starting lnL: %f\n", lh.lnL); + if(opt.verbose) {printf("starting lnL: %f -> ", lh.lnL); fflush(stdout);} // Newton Raphson loop it=0; nbdec=0; - while(it<MAX_ITERATION){ -//printf("%daaa\n", it); + while(it<opt.max_iterations){ if(nbdec==0) lnL=lh.lnL; - if(nbdec==MAX_BACKWARD_MOVES){ + if(nbdec==opt.max_backward_moves){ if(opt.verbose) printf("Optimization problem (too many backward moves), maximum lnL:%f\n", new_lnL); break; } @@ -2463,21 +2728,14 @@ struct gene_likelihood* optimize_lnL_starting(struct gene_data* gdata, int nbg, par.pab+=dpab; par.pac+=dpac; par.pbc+=dpbc; par.pone+=dpone; -//printf("%.10f %.10f %.10f %.10f %.10f %.10f %f\n", dt1, dt2, dth, dpab, dpac, dpbc, dpone); -//printf("%.12f %.12f\n", lh.dlnLdth, lh.d2lnLdth); -//getchar(); - apply_constraints(&par); -//printf("%dbbb\n", it); - set_expected_length(&par, &opt); for(i=0;i<nbg;i++){ calculate_gene_likelihood(gdata+i, &par, &opt, i, glh+i); } lh=calculate_log_likelihood(glh, nbg); new_lnL=lh.lnL; -//printf("%dccc\n", it); if(new_lnL<lnL) { //decreasing -lnL: parameters move back copy_param(&mem_par, &par); @@ -2487,12 +2745,12 @@ struct gene_likelihood* optimize_lnL_starting(struct gene_data* gdata, int nbg, it++; - if(new_lnL-lnL<CONVERGENCE){ + if(new_lnL-lnL<opt.convergence){ if(opt.verbose) printf("Maximum lnL:%f\n", new_lnL); break; } - if(it>MAX_ITERATION){ + if(it>opt.max_iterations){ if(opt.verbose) printf("Optimization problem (too many iterationss), maximum lnL:%f\n", new_lnL); break; } @@ -2573,7 +2831,8 @@ void annotate_genes(struct gene_likelihood* glh, int nb_gene, struct option opt) sum_post+=glh[i].posterior_scenar[j]; } for(j=1;j<nb_scenario_plus1;j++) glh[i].posterior_scenar[j]/=sum_post; - glh[i].posterior_ILS=glh[i].posterior_scenar[2]+glh[i].posterior_scenar[3]+glh[i].posterior_scenar[4]; + glh[i].posterior_ILS=0.; + for(j=0;j<opt.nb_ILS_times;j++) glh[i].posterior_ILS+=glh[i].posterior_scenar[2+3*j]+glh[i].posterior_scenar[3+3*j]+glh[i].posterior_scenar[4+3*j]; glh[i].posterior_HGT=1.-glh[i].posterior_ILS-glh[i].posterior_scenar[1]; glh[i].predicted_scenario=glh[i].predicted_scenario_type=-1; @@ -2605,16 +2864,24 @@ void annotate_global(struct gene_likelihood* glh, int nb_gene, struct posterior* post->scenar[j]/=nb_gene; post->basic=post->scenar[1]; - post->noconflict_ILS=post->scenar[2]; - post->conflict_ILS=post->scenar[3]+post->scenar[4]; + post->noconflict_ILS=0.; + for(j=0;j<opt.nb_ILS_times;j++) post->noconflict_ILS+=post->scenar[2+3*j]; + post->conflict_ILS=0; + for(j=0;j<opt.nb_ILS_times;j++) post->conflict_ILS+=post->scenar[3+3*j]+post->scenar[4+3*j]; post->noconflict_HGT=0; - for(j=0;j<opt.nb_HGT_times;j++) post->noconflict_HGT+=post->scenar[5+5*j]; + for(j=0;j<opt.nb_HGT_times;j++) post->noconflict_HGT+=post->scenar[2+3*opt.nb_ILS_times+5*j]; post->conflict_HGT=0; - for(j=0;j<opt.nb_HGT_times;j++) post->conflict_HGT+=post->scenar[6+5*j]+post->scenar[7+5*j]+post->scenar[8+5*j]+post->scenar[9+5*j]; - post->asymmetry_ILS=post->scenar[3]/post->conflict_ILS; + for(j=0;j<opt.nb_HGT_times;j++) post->conflict_HGT+=post->scenar[3+3*opt.nb_ILS_times+5*j]+post->scenar[4+3*opt.nb_ILS_times+5*j]+post->scenar[5+3*opt.nb_ILS_times+5*j]+post->scenar[6+3*opt.nb_ILS_times+5*j]; + post->asymmetry_ILS=0; + for(j=0;j<opt.nb_ILS_times;j++) post->asymmetry_ILS+=post->scenar[3]; + post->asymmetry_ILS/=post->conflict_ILS; + if(post->asymmetry_ILS<0.5) {post->asymmetry_ILS=1.-post->asymmetry_ILS; post->dominant_ILS=2;} + else post->dominant_ILS=1; post->asymmetry_HGT=0; - for(j=0;j<opt.nb_HGT_times;j++) post->asymmetry_HGT+=post->scenar[6+5*j]+post->scenar[7+5*j]; + for(j=0;j<opt.nb_HGT_times;j++) post->asymmetry_HGT+=post->scenar[3+3*opt.nb_ILS_times+5*j]+post->scenar[4+3*opt.nb_ILS_times+5*j]; post->asymmetry_HGT/=post->conflict_HGT; + if(post->asymmetry_HGT<0.5) {post->asymmetry_HGT=1.-post->asymmetry_HGT; post->dominant_HGT=2;} + else post->dominant_HGT=1; } @@ -2632,7 +2899,8 @@ int calculate_ci(struct gene_data* gdata, int nb_gene, struct option opt, struct struct gene_likelihood* glh; struct param par, start_par; struct posterior post; - + + verb=opt.verbose; //identify master parameter @@ -2715,6 +2983,50 @@ int calculate_ci(struct gene_data* gdata, int nb_gene, struct option opt, struct +/* count_topol */ +void count_topol(struct gene_data* gdata, int nb_gene, int* ntopo0, int* ntopo1, int* ntopo2, int* ntopo3){ + + int i; + + *ntopo1=*ntopo2=*ntopo3=*ntopo0=0; + for(i=0;i<nb_gene;i++){ + if(gdata[i].topology==0) (*ntopo0)++; + else if(gdata[i].topology==1) (*ntopo1)++; + else if(gdata[i].topology==2) (*ntopo2)++; + else if(gdata[i].topology==3) (*ntopo3)++; + } +} + + +/* av_branch_lengths */ +void av_branch_lengths(struct gene_data* gdata, int nb_gene, double* ava, double* avb, double* avc, double* avd){ + + int i; + + *ava=*avb=*avc=*avd=0; + for(i=0;i<nb_gene;i++){ + *ava+=gdata[i].a; + *avb+=gdata[i].b; + *avc+=gdata[i].c; + *avd+=gdata[i].d; + } + (*ava)/=nb_gene; + (*avb)/=nb_gene; + (*avc)/=nb_gene; + (*avd)/=nb_gene; +} + + +/* ctopol */ +char* ctopol(int to){ + + if(to==0) return "((AB)C)"; + if(to==1) return "((AC)B)"; + if(to==2) return "((BC)A)"; + return ""; +} + + /* main */ int main(int argc, char** argv){ @@ -2725,23 +3037,26 @@ int main(int argc, char** argv){ struct posterior post; struct confidence_interval* ci; int nb_gene, nb_ignored, nb_sp, reti, nbok, nbci, i, j; + int ntopo1, ntopo2, ntopo3, ntopo0; double max_lnL; double opt_pab, opt_pac, opt_pbc, opt_pils; int npred_ils, npred_hgt, nils_a, nils_b, nhgt_a, nhgt_b; - double av_lg, sum_post, thresh; + double av_lg, av_a, av_b, av_c, av_d, sum_post, thresh; struct gene_likelihood* glh; FILE* inf, *optf, *taxf, *outf; - if(argc!=5) usage(); - set_log_facto(MAX_BL+1); + if(argc!=5) usage(); //read option file + optf=fopen(argv[3], "r"); if(optf==NULL) {printf("Cannot open file: %s\n", argv[3]); exit(0);} read_option_file(optf, &opt); - if(opt.verbose){ - printf("\n"); + opt.discr_expo_mean1=discretize_exponential(1, opt.nb_ILS_times); + set_log_facto((int)opt.max_bl+1); + + if(opt.verbose){ printf("Reading option file...\n"); print_options(&opt); printf("\n"); @@ -2750,8 +3065,8 @@ int main(int argc, char** argv){ //read taxon file if(opt.verbose) printf("Reading taxon file...\n"); taxf=fopen(argv[2], "r"); - if(optf==NULL) {printf("Cannot open file: %s\n", argv[2]); exit(0);} - read_taxon_file(taxf, &tax); + if(taxf==NULL) {printf("Cannot open file: %s\n", argv[2]); exit(0);} + read_taxon_file(taxf, &tax); if(opt.verbose){ print_taxa(&tax); printf("\n"); @@ -2761,22 +3076,29 @@ int main(int argc, char** argv){ inf=fopen(argv[1], "r"); if(inf==NULL) {printf("Cannot open file: %s\n", argv[1]); exit(0);} if(opt.verbose) printf("Reading data file...\n"); - gdata=read_data_file(inf, &tax, &nb_gene); + gdata=read_data_file(inf, &opt, &tax, &nb_gene); if(opt.verbose) printf("found %d gene trees\n", nb_gene); set_unresolved_topology(gdata, nb_gene, opt.min_d); - find_dubious_genes(gdata, nb_gene); + find_dubious_genes(gdata, nb_gene, &opt); if(opt.verbose) print_ignored(gdata, nb_gene); do_ignore(gdata, &nb_gene); + + // data summary + count_topol(gdata, nb_gene, &ntopo0, &ntopo1, &ntopo2, &ntopo3); + if(opt.verbose) printf("((AB)C):%d [%.1f%%] ((AC)B):%d [%.1f%%] ((BC)A):%d [%.1f%%] (ABC):%d [%.1f%%]\n", ntopo0, 100.*(double)ntopo0/nb_gene, ntopo1, 100.*(double)ntopo1/nb_gene, ntopo2, 100.*(double)ntopo2/nb_gene, ntopo3, 100.*(double)ntopo3/nb_gene); av_lg=0.; for(i=0;i<nb_gene;i++) av_lg+=gdata[i].lgseq; av_lg/=nb_gene; if(opt.verbose) printf("average sequence length: %.1f\n", av_lg); + av_branch_lengths(gdata, nb_gene, &av_a, &av_b, &av_c, &av_d); + if(opt.verbose) printf("average branch lengths: a=%f, b=%f, c=%f, d=%f\n", av_a, av_b, av_c, av_d); if(opt.verbose) printf("\n"); - + //initialize set_parameter_names(); - nb_scenario_plus1=4+5*opt.nb_HGT_times+1; + nb_scenario_plus1=1+3*opt.nb_ILS_times+5*opt.nb_HGT_times+1; allocate_global_var(); + set_scenario_names(&opt); set_topol_scenario(opt); // optimize lnL @@ -2819,12 +3141,12 @@ int main(int argc, char** argv){ if(outf==NULL) {printf("Cannot write file\n"); exit(0);} fprintf(outf, "gene,topology,a,b,c,d,l,lgseq,"); - for(j=1;j<nb_scenario_plus1;j++) fprintf(outf, "p_scenar_%d,", j); - fprintf(outf, "p_ILS,p_HGT,predicted_scenario,predicted_scenario_type\n"); + for(j=1;j<nb_scenario_plus1;j++) fprintf(outf, "%s,", scenario_name[j]); + fprintf(outf, "p_ILS,p_HGT\n"); for(i=0;i<nb_gene;i++){ - fprintf(outf, "%s,%d,%f,%f,%f,%f,%f,%d,", gdata[i].name, gdata[i].topology, gdata[i].a, gdata[i].b, gdata[i].c, gdata[i].d, gdata[i].l, gdata[i].lgseq); + fprintf(outf, "%s,%s,%f,%f,%f,%f,%f,%d,", gdata[i].name, ctopol(gdata[i].topology), gdata[i].a, gdata[i].b, gdata[i].c, gdata[i].d, gdata[i].l, gdata[i].lgseq); for(j=1;j<nb_scenario_plus1;j++) fprintf(outf, "%f,", glh[i].posterior_scenar[j]); - fprintf(outf, "%f,%f,%d,%d\n", glh[i].posterior_ILS, glh[i].posterior_HGT, glh[i].predicted_scenario, glh[i].predicted_scenario_type); + fprintf(outf, "%f,%f\n", glh[i].posterior_ILS, glh[i].posterior_HGT); } if(opt.verbose) printf("\n"); @@ -2852,38 +3174,38 @@ int main(int argc, char** argv){ //output summary if(opt.verbose){ - printf("*** Summary results: ***\n"); + printf("*** Summary results: ***\n\n"); printf("maximum likelihood: %.3f\n", max_lnL); printf("optimized parameters:\n"); print_par(&optimal_par, ci, nbci); printf("\n"); - printf("posterior basic: %.3f\n", post.basic); + printf("posterior no-event: %.3f\n", post.basic); printf("posterior no-conflict ILS: %.3f\n", post.noconflict_ILS); - printf("posterior no-conflict HGT: %.3f\n", post.noconflict_HGT); + printf("posterior no-conflict GF : %.3f\n\n", post.noconflict_HGT); + printf("posterior conflict ILS: %.3f", post.conflict_ILS); if(nbci==1) printf(" [%.3f-%.3f]", ci[0].post_min.conflict_ILS, ci[0].post_max.conflict_ILS); if(nbci==nb_parameters) printf(" [%.3f-%.3f]", ci[2].post_min.conflict_ILS, ci[2].post_max.conflict_ILS); printf("\n"); - printf("posterior conflict HGT: %.3f", post.conflict_HGT); + printf("posterior conflict GF : %.3f", post.conflict_HGT); if(nbci==1) printf(" [%.3f-%.3f]", ci[0].post_min.conflict_HGT, ci[0].post_max.conflict_HGT); if(nbci==nb_parameters) printf(" [%.3f-%.3f]", ci[0].post_min.conflict_HGT, ci[0].post_max.conflict_HGT); - printf("\n"); - printf("posterior asymmetry ILS: %.3f\n", post.asymmetry_ILS); - printf("posterior asymmetry HGT: %.3f\n", post.asymmetry_HGT); + printf("\n\n"); + printf("posterior imbalance ILS: %.3f [%s dominant]\n", post.asymmetry_ILS, ctopol(post.dominant_ILS)); + printf("posterior imbalance GF : %.3f [%s dominant]\n", post.asymmetry_HGT, ctopol(post.dominant_HGT)); printf("\n"); - thresh=opt.posterior_threshold; +/* thresh=opt.posterior_threshold; printf("%d genes with probability of ILS above %.3f \n", npred_ils, thresh); printf("%d genes with probability of HGT above %.3f \n", npred_hgt, thresh); - - printf("Among ILS-conflicting genes, %d have topology ((A,C),B), %d have topology ((B,C),A)\n", nils_b, nils_a); - printf("Among HGT-conflicting genes, %d have topology ((A,C),B), %d have topology ((B,C),A)\n", nhgt_b, nhgt_a); printf("\n"); +*/ } //one-line output - if(!opt.verbose) printf("%d,%.1f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f\n", nb_gene, av_lg, optimal_par.tau1, optimal_par.tau2, optimal_par.theta, optimal_par.pab, optimal_par.pac, optimal_par.pbc, optimal_par.pone, post.basic, post.noconflict_ILS, post.noconflict_HGT, post.conflict_ILS, post.conflict_HGT, post.asymmetry_ILS, post.asymmetry_HGT); + if(!opt.verbose) printf("%d,%d,%d,%d,%d,%.1f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%f,%s,%f,%s,%f\n", nb_gene, ntopo0, ntopo1, ntopo2, ntopo3, av_lg, optimal_par.tau1, optimal_par.tau2, optimal_par.theta, optimal_par.pab, optimal_par.pac, optimal_par.pbc, optimal_par.pone, post.basic, post.noconflict_ILS, post.noconflict_HGT, post.conflict_ILS, post.conflict_HGT, post.asymmetry_ILS, ctopol(post.dominant_ILS), post.asymmetry_HGT, ctopol(post.dominant_HGT), max_lnL); + }