From 65909d0575a50cb35d7c51c9112bfe61eb7aeae2 Mon Sep 17 00:00:00 2001 From: David Emms <david_emms@hotmail.com> Date: Wed, 7 Sep 2016 14:42:14 +0100 Subject: [PATCH] Refactor output files --- README.md | 22 ++++---- ...{OrthologousGroups.csv => Orthogroups.csv} | 0 ...{OrthologousGroups.txt => Orthogroups.txt} | 0 ...ps.csv => Orthogroups_SpeciesOverlaps.csv} | 0 ...es.csv => Orthogroups_UnassignedGenes.csv} | 0 .../AddOneRemoveOne/Statistics_Overall.csv | 6 +-- ...{OrthologousGroups.csv => Orthogroups.csv} | 0 ...{OrthologousGroups.txt => Orthogroups.txt} | 0 ...ps.csv => Orthogroups_SpeciesOverlaps.csv} | 0 ...es.csv => Orthogroups_UnassignedGenes.csv} | 0 .../AddOneSpecies/Statistics_Overall.csv | 6 +-- ...{OrthologousGroups.csv => Orthogroups.csv} | 0 ...{OrthologousGroups.txt => Orthogroups.txt} | 0 ...ps.csv => Orthogroups_SpeciesOverlaps.csv} | 0 ...es.csv => Orthogroups_UnassignedGenes.csv} | 0 .../AddTwoSpecies/Statistics_Overall.csv | 6 +-- ...{OrthologousGroups.csv => Orthogroups.csv} | 0 ...{OrthologousGroups.txt => Orthogroups.txt} | 0 ...es.csv => Orthogroups_UnassignedGenes.csv} | 0 ...{OrthologousGroups.csv => Orthogroups.csv} | 0 ...{OrthologousGroups.txt => Orthogroups.txt} | 0 ...es.csv => Orthogroups_UnassignedGenes.csv} | 0 ...{OrthologousGroups.csv => Orthogroups.csv} | 0 ...{OrthologousGroups.txt => Orthogroups.txt} | 0 ...es.csv => Orthogroups_UnassignedGenes.csv} | 0 ...{OrthologousGroups.csv => Orthogroups.csv} | 0 ...{OrthologousGroups.txt => Orthogroups.txt} | 0 ...es.csv => Orthogroups_UnassignedGenes.csv} | 0 ...{OrthologousGroups.csv => Orthogroups.csv} | 0 ...{OrthologousGroups.txt => Orthogroups.txt} | 0 ...es.csv => Orthogroups_UnassignedGenes.csv} | 0 Tests/test_orthofinder.py | 52 +++++++++---------- ...FP671138.faa => Mycoplasma_agalactiae.faa} | 0 ...15450.faa => Mycoplasma_gallisepticum.faa} | 0 ...7_L43967.faa => Mycoplasma_genitalium.faa} | 0 ...17243.faa => Mycoplasma_hyopneumoniae.faa} | 0 orthofinder/orthofinder.py | 37 ++++++------- orthofinder/scripts/get_orthologues.py | 40 +++++++++----- orthofinder/scripts/util.py | 10 ++-- 39 files changed, 95 insertions(+), 84 deletions(-) rename Tests/ExpectedOutput/AddOneRemoveOne/{OrthologousGroups.csv => Orthogroups.csv} (100%) rename Tests/ExpectedOutput/AddOneRemoveOne/{OrthologousGroups.txt => Orthogroups.txt} (100%) rename Tests/ExpectedOutput/AddOneRemoveOne/{OrthologousGroups_SpeciesOverlaps.csv => Orthogroups_SpeciesOverlaps.csv} (100%) rename Tests/ExpectedOutput/AddOneRemoveOne/{OrthologousGroups_UnassignedGenes.csv => Orthogroups_UnassignedGenes.csv} (100%) rename Tests/ExpectedOutput/AddOneSpecies/{OrthologousGroups.csv => Orthogroups.csv} (100%) rename Tests/ExpectedOutput/AddOneSpecies/{OrthologousGroups.txt => Orthogroups.txt} (100%) rename Tests/ExpectedOutput/AddOneSpecies/{OrthologousGroups_SpeciesOverlaps.csv => Orthogroups_SpeciesOverlaps.csv} (100%) rename Tests/ExpectedOutput/AddOneSpecies/{OrthologousGroups_UnassignedGenes.csv => Orthogroups_UnassignedGenes.csv} (100%) rename Tests/ExpectedOutput/AddTwoSpecies/{OrthologousGroups.csv => Orthogroups.csv} (100%) rename Tests/ExpectedOutput/AddTwoSpecies/{OrthologousGroups.txt => Orthogroups.txt} (100%) rename Tests/ExpectedOutput/AddTwoSpecies/{OrthologousGroups_SpeciesOverlaps.csv => Orthogroups_SpeciesOverlaps.csv} (100%) rename Tests/ExpectedOutput/AddTwoSpecies/{OrthologousGroups_UnassignedGenes.csv => Orthogroups_UnassignedGenes.csv} (100%) rename Tests/ExpectedOutput/RemoveFirstSpecies/{OrthologousGroups.csv => Orthogroups.csv} (100%) rename Tests/ExpectedOutput/RemoveFirstSpecies/{OrthologousGroups.txt => Orthogroups.txt} (100%) rename Tests/ExpectedOutput/RemoveFirstSpecies/{OrthologousGroups_UnassignedGenes.csv => Orthogroups_UnassignedGenes.csv} (100%) rename Tests/ExpectedOutput/RemoveLastSpecies/{OrthologousGroups.csv => Orthogroups.csv} (100%) rename Tests/ExpectedOutput/RemoveLastSpecies/{OrthologousGroups.txt => Orthogroups.txt} (100%) rename Tests/ExpectedOutput/RemoveLastSpecies/{OrthologousGroups_UnassignedGenes.csv => Orthogroups_UnassignedGenes.csv} (100%) rename Tests/ExpectedOutput/RemoveMiddleSpecies/{OrthologousGroups.csv => Orthogroups.csv} (100%) rename Tests/ExpectedOutput/RemoveMiddleSpecies/{OrthologousGroups.txt => Orthogroups.txt} (100%) rename Tests/ExpectedOutput/RemoveMiddleSpecies/{OrthologousGroups_UnassignedGenes.csv => Orthogroups_UnassignedGenes.csv} (100%) rename Tests/ExpectedOutput/SmallExampleDataset/{OrthologousGroups.csv => Orthogroups.csv} (100%) rename Tests/ExpectedOutput/SmallExampleDataset/{OrthologousGroups.txt => Orthogroups.txt} (100%) rename Tests/ExpectedOutput/SmallExampleDataset/{OrthologousGroups_UnassignedGenes.csv => Orthogroups_UnassignedGenes.csv} (100%) rename Tests/ExpectedOutput/SmallExampleDataset_I1.8/{OrthologousGroups.csv => Orthogroups.csv} (100%) rename Tests/ExpectedOutput/SmallExampleDataset_I1.8/{OrthologousGroups.txt => Orthogroups.txt} (100%) rename Tests/ExpectedOutput/SmallExampleDataset_I1.8/{OrthologousGroups_UnassignedGenes.csv => Orthogroups_UnassignedGenes.csv} (100%) rename orthofinder/ExampleDataset/{Mycoplasma_agalactiae_5632_FP671138.faa => Mycoplasma_agalactiae.faa} (100%) rename orthofinder/ExampleDataset/{Mycoplasma_gallisepticum_uid409_AE015450.faa => Mycoplasma_gallisepticum.faa} (100%) rename orthofinder/ExampleDataset/{Mycoplasma_genitalium_uid97_L43967.faa => Mycoplasma_genitalium.faa} (100%) rename orthofinder/ExampleDataset/{Mycoplasma_hyopneumoniae_AE017243.faa => Mycoplasma_hyopneumoniae.faa} (100%) diff --git a/README.md b/README.md index 6b3fe1b..07c3e78 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ What's New ========== **Sep. 2016**: OrthoFinder now infers the **gene trees** for the orthogroups, the **rooted species tree**, all **orthologues** between all species and calculates summary statistics. -**Jul. 2016**: OrthoFinder now outputs **summary statistics** for the orthogroups produced. Statistics are in the files **Statistics_Overall.csv, Statistics_PerSpecies.csv** and **OrthologousGroups_SpeciesOverlaps.csv**. +**Jul. 2016**: OrthoFinder now outputs **summary statistics** for the orthogroups produced. Statistics are in the files **Statistics_Overall.csv, Statistics_PerSpecies.csv** and **Orthogroups_SpeciesOverlaps.csv**. **Jul. 2016**: Provided **standalone binaries** for those without access to python (download the package from OrthoFinder's GitHub **releases tab**). @@ -61,17 +61,17 @@ An orthogroup is the set of genes that are descended from a single gene in the l OrthoFinder generates three output files for orthogroups: -**1) OrthologousGroups.csv** is a tab separated text file. Each row comprises a single orthogroup and contains all the genes that belong to that orthogroup. The genes are organized into separate columns where each column corresponds to a single species. +**1) Orthogroups.csv** is a tab separated text file. Each row comprises a single orthogroup and contains all the genes that belong to that orthogroup. The genes are organized into separate columns where each column corresponds to a single species. -**2) OrthologousGroups.txt** is a tab separated text file that is identical in format to the output file from OrthoMCL. This enables OrthoFinder to easily slot into existing bioinformatic pipelines. +**2) Orthogroups.txt** is a tab separated text file that is identical in format to the output file from OrthoMCL. This enables OrthoFinder to easily slot into existing bioinformatic pipelines. -**3) OrthologousGroups_UnassignedGenes.csv** is a tab separated text file that is identical in format to OrthologousGroups.csv but contains all of the genes that were not assigned to any orthogroup. +**3) Orthogroups_UnassignedGenes.csv** is a tab separated text file that is identical in format to Orthogroups.csv but contains all of the genes that were not assigned to any orthogroup. **4) Statistics_Overall.csv** is a tab separated text file giving statistics for the orthogroups. **5) Statistics_PerSpecies.csv** is a tab separated text file giving statistics for the orthogroups on a species-by-species basis. -**6) OrthologousGroups_SpeciesOverlaps.csv** is a tab separated text file containing a matrix of the number of orthogroups shared by each species-pair (i.e. the number of orthogroups which contain at least one gene from each of the species-pairs) +**6) Orthogroups_SpeciesOverlaps.csv** is a tab separated text file containing a matrix of the number of orthogroups shared by each species-pair (i.e. the number of orthogroups which contain at least one gene from each of the species-pairs) ###Statistics Files Most of the terms in the files **Statistics_Overall.csv** and **Statistics_PerSpecies.csv** are self-explanatory, the remainder are defined below: @@ -121,7 +121,7 @@ Executables are found here ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/L MCL --- -mcl is available in the repositories for some linux distributions and so can be installed in the same way as any other package. E.g. on Ubuntu "sudo apt-get install mcl". Alternatively it can be built from source which will likely require the build-essential or equivalent package on the Linux distribution being used. Instructions are provided on the MCL webpage. +mcl is available in the repositories for some linux distributions and so can be installed in the same way as any other package. E.g. on Ubuntu "sudo apt-get install mcl". Alternatively it can be built from source which will likely require the build-essential or equivalent package on the Linux distribution being used. Instructions are provided on the MCL webpage. FastME ------ @@ -138,7 +138,7 @@ Once the required dependencies have been installed, OrthoFinder can be setup and 1. Save OrthoFinder-master.zip and unpack it 2. Open a terminal and cd into the directory OrthoFinder-master 3. python orthofinder.py -f ExampleDataset/ -4. If everything was successful the output generated will end with a line giving the location of the results file containing the orthologous groups. +4. If everything was successful the output generated will end with a line giving the location of the results file containing the orthogroups. The command for running OrthoFinder on any dataset is: @@ -245,7 +245,7 @@ BLAST results files ------------------- For each species pair x, y there should be a BLAST results file Blastx_y.txt where x is the index of the query fasta file and y is the index of the species used for the database. Similarly, there should be a BLAST results file Blasty_x.txt where y is the index of the query fasta file and x is the index of the species used for the database. The tabular BLAST output format 6 should be used. The query and hit IDs in the BLAST results files should correspond to the IDs in the fasta files. -**Aside, reducing BLAST computations:** Note that since the BLAST queries are by far the most computationally expensive step, considerable time could be saved by only performing n(n+1)/2 of the species versus species BLAST queries instead of n^2, where n is the number of species. This would be done by only searching Species<x>.fa against the BLAST database generated from Species<y>.fa if x <= y. The results would give the file Blastx_y.txt and then this file could be used to generate the Blasty_x.txt file by swapping the query and hit sequence on each line in the results file. This should have only a small effect on the generated orthologous groups. +**Aside, reducing BLAST computations:** Note that since the BLAST queries are by far the most computationally expensive step, considerable time could be saved by only performing n(n+1)/2 of the species versus species BLAST queries instead of n^2, where n is the number of species. This would be done by only searching Species<x>.fa against the BLAST database generated from Species<y>.fa if x <= y. The results would give the file Blastx_y.txt and then this file could be used to generate the Blasty_x.txt file by swapping the query and hit sequence on each line in the results file. This should have only a small effect on the generated orthogroups. SequenceIDs.txt --------------- @@ -280,9 +280,9 @@ Orthobench with pre-computed BLAST results The BLAST pre-calculated BLAST results files etc. for the Orthobench dataset are available for download as are the original fasta files. -Output orthologous groups using the orthoxml format +Output orthogroups using the orthoxml format =================================================== -Orthologous groups can be output using the orthoxml format. This is requested by adding '-x speciesInfoFilename' to the command used to call orthofinder, where speciesInfoFilename should be the filename (including the path if necessary) of a user prepared file providing the information about the species that is required by the orthoxml format. This file should contain one line per species and each line should contain the following 5 fields separated by tabs: +Orthogroups can be output using the orthoxml format. This is requested by adding '-x speciesInfoFilename' to the command used to call orthofinder, where speciesInfoFilename should be the filename (including the path if necessary) of a user prepared file providing the information about the species that is required by the orthoxml format. This file should contain one line per species and each line should contain the following 5 fields separated by tabs: 1. **fasta filename**: the filename (without path) of the fasta file for the species described on this line 2. **species name**: the name of the species @@ -299,7 +299,7 @@ Information on the orthoxml format can be found here: http://orthoxml.org/0.3/or Trees for Orthogroups ===================== -The 'trees_from_MSA.py' utility will automatically generate multiple sequence alignments and gene trees for each orthologous group generated by OrthoFinder. For example, once OrthoFinder has been run on the example dataset, trees_from_MSA can be run using: +The 'trees_from_MSA.py' utility will automatically generate multiple sequence alignments and gene trees for each orthogroup generated by OrthoFinder. For example, once OrthoFinder has been run on the example dataset, trees_from_MSA can be run using: **python trees_from_MSA.py ExampleDataset/Results_\<date\> -t 16** diff --git a/Tests/ExpectedOutput/AddOneRemoveOne/OrthologousGroups.csv b/Tests/ExpectedOutput/AddOneRemoveOne/Orthogroups.csv similarity index 100% rename from Tests/ExpectedOutput/AddOneRemoveOne/OrthologousGroups.csv rename to Tests/ExpectedOutput/AddOneRemoveOne/Orthogroups.csv diff --git a/Tests/ExpectedOutput/AddOneRemoveOne/OrthologousGroups.txt b/Tests/ExpectedOutput/AddOneRemoveOne/Orthogroups.txt similarity index 100% rename from Tests/ExpectedOutput/AddOneRemoveOne/OrthologousGroups.txt rename to Tests/ExpectedOutput/AddOneRemoveOne/Orthogroups.txt diff --git a/Tests/ExpectedOutput/AddOneRemoveOne/OrthologousGroups_SpeciesOverlaps.csv b/Tests/ExpectedOutput/AddOneRemoveOne/Orthogroups_SpeciesOverlaps.csv similarity index 100% rename from Tests/ExpectedOutput/AddOneRemoveOne/OrthologousGroups_SpeciesOverlaps.csv rename to Tests/ExpectedOutput/AddOneRemoveOne/Orthogroups_SpeciesOverlaps.csv diff --git a/Tests/ExpectedOutput/AddOneRemoveOne/OrthologousGroups_UnassignedGenes.csv b/Tests/ExpectedOutput/AddOneRemoveOne/Orthogroups_UnassignedGenes.csv similarity index 100% rename from Tests/ExpectedOutput/AddOneRemoveOne/OrthologousGroups_UnassignedGenes.csv rename to Tests/ExpectedOutput/AddOneRemoveOne/Orthogroups_UnassignedGenes.csv diff --git a/Tests/ExpectedOutput/AddOneRemoveOne/Statistics_Overall.csv b/Tests/ExpectedOutput/AddOneRemoveOne/Statistics_Overall.csv index 2f040c7..0141814 100644 --- a/Tests/ExpectedOutput/AddOneRemoveOne/Statistics_Overall.csv +++ b/Tests/ExpectedOutput/AddOneRemoveOne/Statistics_Overall.csv @@ -16,11 +16,11 @@ O50 (all genes) 310 Number of orthogroups with all species present 279 Number of single-copy orthogroups 257 Date 2016-07-15 -Orthogroups file OrthologousGroups.csv -Unassigned genes file OrthologousGroups_UnassignedGenes.csv +Orthogroups file Orthogroups.csv +Unassigned genes file Orthogroups_UnassignedGenes.csv Per-species statistics Statistics_PerSpecies.csv Overall statistics Statistics_Overall.csv -Orthogroups shared between species OrthologousGroups_SpeciesOverlaps.csv +Orthogroups shared between species Orthogroups_SpeciesOverlaps.csv Average number of genes per-species in orthogroup Number of orthogroups Percentage of orthogroups Number of genes Percentage of genes <1 126 29.5 252 20.7 diff --git a/Tests/ExpectedOutput/AddOneSpecies/OrthologousGroups.csv b/Tests/ExpectedOutput/AddOneSpecies/Orthogroups.csv similarity index 100% rename from Tests/ExpectedOutput/AddOneSpecies/OrthologousGroups.csv rename to Tests/ExpectedOutput/AddOneSpecies/Orthogroups.csv diff --git a/Tests/ExpectedOutput/AddOneSpecies/OrthologousGroups.txt b/Tests/ExpectedOutput/AddOneSpecies/Orthogroups.txt similarity index 100% rename from Tests/ExpectedOutput/AddOneSpecies/OrthologousGroups.txt rename to Tests/ExpectedOutput/AddOneSpecies/Orthogroups.txt diff --git a/Tests/ExpectedOutput/AddOneSpecies/OrthologousGroups_SpeciesOverlaps.csv b/Tests/ExpectedOutput/AddOneSpecies/Orthogroups_SpeciesOverlaps.csv similarity index 100% rename from Tests/ExpectedOutput/AddOneSpecies/OrthologousGroups_SpeciesOverlaps.csv rename to Tests/ExpectedOutput/AddOneSpecies/Orthogroups_SpeciesOverlaps.csv diff --git a/Tests/ExpectedOutput/AddOneSpecies/OrthologousGroups_UnassignedGenes.csv b/Tests/ExpectedOutput/AddOneSpecies/Orthogroups_UnassignedGenes.csv similarity index 100% rename from Tests/ExpectedOutput/AddOneSpecies/OrthologousGroups_UnassignedGenes.csv rename to Tests/ExpectedOutput/AddOneSpecies/Orthogroups_UnassignedGenes.csv diff --git a/Tests/ExpectedOutput/AddOneSpecies/Statistics_Overall.csv b/Tests/ExpectedOutput/AddOneSpecies/Statistics_Overall.csv index 14ad1f7..84042d4 100644 --- a/Tests/ExpectedOutput/AddOneSpecies/Statistics_Overall.csv +++ b/Tests/ExpectedOutput/AddOneSpecies/Statistics_Overall.csv @@ -16,11 +16,11 @@ O50 (all genes) 299 Number of orthogroups with all species present 278 Number of single-copy orthogroups 254 Date 2016-07-15 -Orthogroups file OrthologousGroups.csv -Unassigned genes file OrthologousGroups_UnassignedGenes.csv +Orthogroups file Orthogroups.csv +Unassigned genes file Orthogroups_UnassignedGenes.csv Per-species statistics Statistics_PerSpecies.csv Overall statistics Statistics_Overall.csv -Orthogroups shared between species OrthologousGroups_SpeciesOverlaps.csv +Orthogroups shared between species Orthogroups_SpeciesOverlaps.csv Average number of genes per-species in orthogroup Number of orthogroups Percentage of orthogroups Number of genes Percentage of genes <1 224 41.8 516 26.6 diff --git a/Tests/ExpectedOutput/AddTwoSpecies/OrthologousGroups.csv b/Tests/ExpectedOutput/AddTwoSpecies/Orthogroups.csv similarity index 100% rename from Tests/ExpectedOutput/AddTwoSpecies/OrthologousGroups.csv rename to Tests/ExpectedOutput/AddTwoSpecies/Orthogroups.csv diff --git a/Tests/ExpectedOutput/AddTwoSpecies/OrthologousGroups.txt b/Tests/ExpectedOutput/AddTwoSpecies/Orthogroups.txt similarity index 100% rename from Tests/ExpectedOutput/AddTwoSpecies/OrthologousGroups.txt rename to Tests/ExpectedOutput/AddTwoSpecies/Orthogroups.txt diff --git a/Tests/ExpectedOutput/AddTwoSpecies/OrthologousGroups_SpeciesOverlaps.csv b/Tests/ExpectedOutput/AddTwoSpecies/Orthogroups_SpeciesOverlaps.csv similarity index 100% rename from Tests/ExpectedOutput/AddTwoSpecies/OrthologousGroups_SpeciesOverlaps.csv rename to Tests/ExpectedOutput/AddTwoSpecies/Orthogroups_SpeciesOverlaps.csv diff --git a/Tests/ExpectedOutput/AddTwoSpecies/OrthologousGroups_UnassignedGenes.csv b/Tests/ExpectedOutput/AddTwoSpecies/Orthogroups_UnassignedGenes.csv similarity index 100% rename from Tests/ExpectedOutput/AddTwoSpecies/OrthologousGroups_UnassignedGenes.csv rename to Tests/ExpectedOutput/AddTwoSpecies/Orthogroups_UnassignedGenes.csv diff --git a/Tests/ExpectedOutput/AddTwoSpecies/Statistics_Overall.csv b/Tests/ExpectedOutput/AddTwoSpecies/Statistics_Overall.csv index e462959..8bfd2f0 100644 --- a/Tests/ExpectedOutput/AddTwoSpecies/Statistics_Overall.csv +++ b/Tests/ExpectedOutput/AddTwoSpecies/Statistics_Overall.csv @@ -16,11 +16,11 @@ O50 (all genes) 309 Number of orthogroups with all species present 277 Number of single-copy orthogroups 257 Date 2016-07-15 -Orthogroups file OrthologousGroups.csv -Unassigned genes file OrthologousGroups_UnassignedGenes.csv +Orthogroups file Orthogroups.csv +Unassigned genes file Orthogroups_UnassignedGenes.csv Per-species statistics Statistics_PerSpecies.csv Overall statistics Statistics_Overall.csv -Orthogroups shared between species OrthologousGroups_SpeciesOverlaps.csv +Orthogroups shared between species Orthogroups_SpeciesOverlaps.csv Average number of genes per-species in orthogroup Number of orthogroups Percentage of orthogroups Number of genes Percentage of genes <1 412 56.0 1050 36.5 diff --git a/Tests/ExpectedOutput/RemoveFirstSpecies/OrthologousGroups.csv b/Tests/ExpectedOutput/RemoveFirstSpecies/Orthogroups.csv similarity index 100% rename from Tests/ExpectedOutput/RemoveFirstSpecies/OrthologousGroups.csv rename to Tests/ExpectedOutput/RemoveFirstSpecies/Orthogroups.csv diff --git a/Tests/ExpectedOutput/RemoveFirstSpecies/OrthologousGroups.txt b/Tests/ExpectedOutput/RemoveFirstSpecies/Orthogroups.txt similarity index 100% rename from Tests/ExpectedOutput/RemoveFirstSpecies/OrthologousGroups.txt rename to Tests/ExpectedOutput/RemoveFirstSpecies/Orthogroups.txt diff --git a/Tests/ExpectedOutput/RemoveFirstSpecies/OrthologousGroups_UnassignedGenes.csv b/Tests/ExpectedOutput/RemoveFirstSpecies/Orthogroups_UnassignedGenes.csv similarity index 100% rename from Tests/ExpectedOutput/RemoveFirstSpecies/OrthologousGroups_UnassignedGenes.csv rename to Tests/ExpectedOutput/RemoveFirstSpecies/Orthogroups_UnassignedGenes.csv diff --git a/Tests/ExpectedOutput/RemoveLastSpecies/OrthologousGroups.csv b/Tests/ExpectedOutput/RemoveLastSpecies/Orthogroups.csv similarity index 100% rename from Tests/ExpectedOutput/RemoveLastSpecies/OrthologousGroups.csv rename to Tests/ExpectedOutput/RemoveLastSpecies/Orthogroups.csv diff --git a/Tests/ExpectedOutput/RemoveLastSpecies/OrthologousGroups.txt b/Tests/ExpectedOutput/RemoveLastSpecies/Orthogroups.txt similarity index 100% rename from Tests/ExpectedOutput/RemoveLastSpecies/OrthologousGroups.txt rename to Tests/ExpectedOutput/RemoveLastSpecies/Orthogroups.txt diff --git a/Tests/ExpectedOutput/RemoveLastSpecies/OrthologousGroups_UnassignedGenes.csv b/Tests/ExpectedOutput/RemoveLastSpecies/Orthogroups_UnassignedGenes.csv similarity index 100% rename from Tests/ExpectedOutput/RemoveLastSpecies/OrthologousGroups_UnassignedGenes.csv rename to Tests/ExpectedOutput/RemoveLastSpecies/Orthogroups_UnassignedGenes.csv diff --git a/Tests/ExpectedOutput/RemoveMiddleSpecies/OrthologousGroups.csv b/Tests/ExpectedOutput/RemoveMiddleSpecies/Orthogroups.csv similarity index 100% rename from Tests/ExpectedOutput/RemoveMiddleSpecies/OrthologousGroups.csv rename to Tests/ExpectedOutput/RemoveMiddleSpecies/Orthogroups.csv diff --git a/Tests/ExpectedOutput/RemoveMiddleSpecies/OrthologousGroups.txt b/Tests/ExpectedOutput/RemoveMiddleSpecies/Orthogroups.txt similarity index 100% rename from Tests/ExpectedOutput/RemoveMiddleSpecies/OrthologousGroups.txt rename to Tests/ExpectedOutput/RemoveMiddleSpecies/Orthogroups.txt diff --git a/Tests/ExpectedOutput/RemoveMiddleSpecies/OrthologousGroups_UnassignedGenes.csv b/Tests/ExpectedOutput/RemoveMiddleSpecies/Orthogroups_UnassignedGenes.csv similarity index 100% rename from Tests/ExpectedOutput/RemoveMiddleSpecies/OrthologousGroups_UnassignedGenes.csv rename to Tests/ExpectedOutput/RemoveMiddleSpecies/Orthogroups_UnassignedGenes.csv diff --git a/Tests/ExpectedOutput/SmallExampleDataset/OrthologousGroups.csv b/Tests/ExpectedOutput/SmallExampleDataset/Orthogroups.csv similarity index 100% rename from Tests/ExpectedOutput/SmallExampleDataset/OrthologousGroups.csv rename to Tests/ExpectedOutput/SmallExampleDataset/Orthogroups.csv diff --git a/Tests/ExpectedOutput/SmallExampleDataset/OrthologousGroups.txt b/Tests/ExpectedOutput/SmallExampleDataset/Orthogroups.txt similarity index 100% rename from Tests/ExpectedOutput/SmallExampleDataset/OrthologousGroups.txt rename to Tests/ExpectedOutput/SmallExampleDataset/Orthogroups.txt diff --git a/Tests/ExpectedOutput/SmallExampleDataset/OrthologousGroups_UnassignedGenes.csv b/Tests/ExpectedOutput/SmallExampleDataset/Orthogroups_UnassignedGenes.csv similarity index 100% rename from Tests/ExpectedOutput/SmallExampleDataset/OrthologousGroups_UnassignedGenes.csv rename to Tests/ExpectedOutput/SmallExampleDataset/Orthogroups_UnassignedGenes.csv diff --git a/Tests/ExpectedOutput/SmallExampleDataset_I1.8/OrthologousGroups.csv b/Tests/ExpectedOutput/SmallExampleDataset_I1.8/Orthogroups.csv similarity index 100% rename from Tests/ExpectedOutput/SmallExampleDataset_I1.8/OrthologousGroups.csv rename to Tests/ExpectedOutput/SmallExampleDataset_I1.8/Orthogroups.csv diff --git a/Tests/ExpectedOutput/SmallExampleDataset_I1.8/OrthologousGroups.txt b/Tests/ExpectedOutput/SmallExampleDataset_I1.8/Orthogroups.txt similarity index 100% rename from Tests/ExpectedOutput/SmallExampleDataset_I1.8/OrthologousGroups.txt rename to Tests/ExpectedOutput/SmallExampleDataset_I1.8/Orthogroups.txt diff --git a/Tests/ExpectedOutput/SmallExampleDataset_I1.8/OrthologousGroups_UnassignedGenes.csv b/Tests/ExpectedOutput/SmallExampleDataset_I1.8/Orthogroups_UnassignedGenes.csv similarity index 100% rename from Tests/ExpectedOutput/SmallExampleDataset_I1.8/OrthologousGroups_UnassignedGenes.csv rename to Tests/ExpectedOutput/SmallExampleDataset_I1.8/Orthogroups_UnassignedGenes.csv diff --git a/Tests/test_orthofinder.py b/Tests/test_orthofinder.py index 66b13ae..1f95345 100755 --- a/Tests/test_orthofinder.py +++ b/Tests/test_orthofinder.py @@ -33,7 +33,7 @@ exampleBlastDir = baseDir + "Input/SmallExampleDataset_ExampleBlastDir/" goldResultsDir_smallExample = baseDir + "ExpectedOutput/SmallExampleDataset/" goldPrepareBlastDir = baseDir + "ExpectedOutput/SmallExampleDataset_PreparedForBlast/" -version = "1.0.1" +version = "1.0.2" requiredBlastVersion = "2.2.28+" citation = """When publishing work that uses OrthoFinder please cite: @@ -179,7 +179,7 @@ class TestCommandLine(unittest.TestCase): # @unittest.skipIf(__skipLongTests__, "Only performing quick tests") def test_fromfasta(self): currentResultsDir = exampleFastaDir + "Results_%s/" % datetime.date.today().strftime("%b%d") - expectedCSVFile = currentResultsDir + "OrthologousGroups.csv" + expectedCSVFile = currentResultsDir + "Orthogroups.csv" with CleanUp([], [], [currentResultsDir, ]): self.stdout, self.stderr = self.RunOrthoFinder("-f %s -g" % exampleFastaDir) self.CheckStandardRun(self.stdout, self.stderr, goldResultsDir_smallExample, expectedCSVFile) @@ -187,7 +187,7 @@ class TestCommandLine(unittest.TestCase): def test_fromfasta_threads(self): currentResultsDir = exampleFastaDir + "Results_%s/" % datetime.date.today().strftime("%b%d") - expectedCSVFile = currentResultsDir + "OrthologousGroups.csv" + expectedCSVFile = currentResultsDir + "Orthogroups.csv" with CleanUp([], [], [currentResultsDir, ]): self.stdout, self.stderr = self.RunOrthoFinder("-f %s -t 4 -a 3 -g" % exampleFastaDir) self.CheckStandardRun(self.stdout, self.stderr, goldResultsDir_smallExample, expectedCSVFile) @@ -196,7 +196,7 @@ class TestCommandLine(unittest.TestCase): @unittest.skipIf(__skipLongTests__, "Only performing quick tests") def test_fromfasta_full(self): currentResultsDir = exampleFastaDir + "Results_%s/" % datetime.date.today().strftime("%b%d") - expectedCSVFile = currentResultsDir + "OrthologousGroups.csv" + expectedCSVFile = currentResultsDir + "Orthogroups.csv" with CleanUp([], [], [currentResultsDir, ]): self.stdout, self.stderr = self.RunOrthoFinder("--fasta %s -g" % exampleFastaDir) self.CheckStandardRun(self.stdout, self.stderr, goldResultsDir_smallExample, expectedCSVFile) @@ -245,8 +245,8 @@ class TestCommandLine(unittest.TestCase): self.test_passed = True def test_fromblast(self): - expectedCSVFile = exampleBlastDir + "OrthologousGroups.csv" - newFiles = ("OrthologousGroups.csv OrthologousGroups_UnassignedGenes.csv OrthologousGroups.txt clusters_OrthoFinder_v%s_I1.5.txt_id_pairs.txt clusters_OrthoFinder_v%s_I1.5.txt OrthoFinder_v%s_graph.txt Statistics_PerSpecies.csv Statistics_Overall.csv OrthologousGroups_SpeciesOverlaps.csv" % (version, version, version)).split() + expectedCSVFile = exampleBlastDir + "Orthogroups.csv" + newFiles = ("Orthogroups.csv Orthogroups_UnassignedGenes.csv Orthogroups.txt clusters_OrthoFinder_v%s_I1.5.txt_id_pairs.txt clusters_OrthoFinder_v%s_I1.5.txt OrthoFinder_v%s_graph.txt Statistics_PerSpecies.csv Statistics_Overall.csv Orthogroups_SpeciesOverlaps.csv" % (version, version, version)).split() newFiles = [exampleBlastDir + fn for fn in newFiles] with CleanUp(newFiles, []): self.stdout, self.stderr = self.RunOrthoFinder("-b %s -g" % exampleBlastDir) @@ -254,8 +254,8 @@ class TestCommandLine(unittest.TestCase): self.test_passed = True def test_fromblast_full(self): - expectedCSVFile = exampleBlastDir + "OrthologousGroups.csv" - newFiles = ("OrthologousGroups.csv OrthologousGroups_UnassignedGenes.csv OrthologousGroups.txt clusters_OrthoFinder_v%s_I1.5.txt_id_pairs.txt clusters_OrthoFinder_v%s_I1.5.txt OrthoFinder_v%s_graph.txt Statistics_PerSpecies.csv Statistics_Overall.csv OrthologousGroups_SpeciesOverlaps.csv" % (version, version, version)).split() + expectedCSVFile = exampleBlastDir + "Orthogroups.csv" + newFiles = ("Orthogroups.csv Orthogroups_UnassignedGenes.csv Orthogroups.txt clusters_OrthoFinder_v%s_I1.5.txt_id_pairs.txt clusters_OrthoFinder_v%s_I1.5.txt OrthoFinder_v%s_graph.txt Statistics_PerSpecies.csv Statistics_Overall.csv Orthogroups_SpeciesOverlaps.csv" % (version, version, version)).split() newFiles = [exampleBlastDir + fn for fn in newFiles] with CleanUp(newFiles, []): self.stdout, self.stderr = self.RunOrthoFinder("--blast %s -g" % exampleBlastDir) @@ -263,8 +263,8 @@ class TestCommandLine(unittest.TestCase): self.test_passed = True def test_fromblast_algthreads(self): - expectedCSVFile = exampleBlastDir + "OrthologousGroups.csv" - newFiles = ("Statistics_PerSpecies.csv Statistics_Overall.csv OrthologousGroups_SpeciesOverlaps.csv OrthologousGroups.csv OrthologousGroups_UnassignedGenes.csv OrthologousGroups.txt clusters_OrthoFinder_v%s_I1.5.txt_id_pairs.txt clusters_OrthoFinder_v%s_I1.5.txt OrthoFinder_v%s_graph.txt" % (version, version, version)).split() + expectedCSVFile = exampleBlastDir + "Orthogroups.csv" + newFiles = ("Statistics_PerSpecies.csv Statistics_Overall.csv Orthogroups_SpeciesOverlaps.csv Orthogroups.csv Orthogroups_UnassignedGenes.csv Orthogroups.txt clusters_OrthoFinder_v%s_I1.5.txt_id_pairs.txt clusters_OrthoFinder_v%s_I1.5.txt OrthoFinder_v%s_graph.txt" % (version, version, version)).split() newFiles = [exampleBlastDir + fn for fn in newFiles] with CleanUp(newFiles, []): self.stdout, self.stderr = self.RunOrthoFinder("-b %s -a 3 -g" % exampleBlastDir) @@ -284,8 +284,8 @@ class TestCommandLine(unittest.TestCase): self.test_passed = True def test_inflation(self): - expectedCSVFile = exampleBlastDir + "OrthologousGroups.csv" - newFiles = ("Statistics_PerSpecies.csv Statistics_Overall.csv OrthologousGroups_SpeciesOverlaps.csv OrthologousGroups.csv OrthologousGroups_UnassignedGenes.csv OrthologousGroups.txt clusters_OrthoFinder_v%s_I1.8.txt_id_pairs.txt clusters_OrthoFinder_v%s_I1.8.txt OrthoFinder_v%s_graph.txt" % (version, version, version)).split() + expectedCSVFile = exampleBlastDir + "Orthogroups.csv" + newFiles = ("Statistics_PerSpecies.csv Statistics_Overall.csv Orthogroups_SpeciesOverlaps.csv Orthogroups.csv Orthogroups_UnassignedGenes.csv Orthogroups.txt clusters_OrthoFinder_v%s_I1.8.txt_id_pairs.txt clusters_OrthoFinder_v%s_I1.8.txt OrthoFinder_v%s_graph.txt" % (version, version, version)).split() newFiles = [exampleBlastDir + fn for fn in newFiles] with CleanUp(newFiles, []): self.stdout, self.stderr = self.RunOrthoFinder("-I 1.8 -b %s -g" % exampleBlastDir) @@ -295,7 +295,7 @@ class TestCommandLine(unittest.TestCase): # @unittest.skipIf(__skipLongTests__, "Only performing quick tests") # def test_fromblastOrthobench(self): # goldResultsDir_orthobench = baseDir + "ExpectedOutput/Orthobench_blast/" -# expectedCSVFileLocation = baseDir + "Input/Orthobench_blast/OrthologousGroups.csv" +# expectedCSVFileLocation = baseDir + "Input/Orthobench_blast/Orthogroups.csv" # self.currentResultsDir = None # expectedNewFiles = [baseDir + "Input/Orthobench_blast/" + x for x in "OrthoFinder_v0.4.0_graph.txt clusters_OrthoFinder_v0.4.0_I1.5.txt clusters_OrthoFinder_v0.4.0_I1.5.txt_id_pairs.txt".split()] # with CleanUp(expectedNewFiles, []): @@ -329,8 +329,8 @@ class TestCommandLine(unittest.TestCase): def test_addOneSpecies(self): expectedExtraFiles = [exampleBlastDir + fn for fn in ("Blast0_3.txt Blast3_0.txt Blast1_3.txt Blast3_1.txt Blast2_3.txt Blast3_2.txt Blast3_3.txt Species3.fa \ - OrthologousGroups.csv OrthologousGroups.txt OrthologousGroups_UnassignedGenes.csv \ - Statistics_PerSpecies.csv Statistics_Overall.csv OrthologousGroups_SpeciesOverlaps.csv \ + Orthogroups.csv Orthogroups.txt Orthogroups_UnassignedGenes.csv \ + Statistics_PerSpecies.csv Statistics_Overall.csv Orthogroups_SpeciesOverlaps.csv \ clusters_OrthoFinder_v%s_I1.5.txt clusters_OrthoFinder_v%s_I1.5.txt_id_pairs.txt OrthoFinder_v%s_graph.txt" % (version, version, version)).split()] expectedChangedFiles = [exampleBlastDir + fn for fn in "SpeciesIDs.txt SequenceIDs.txt".split()] # cleanup afterwards including failed test @@ -351,8 +351,8 @@ class TestCommandLine(unittest.TestCase): def test_addTwoSpecies(self): expectedExtraFiles = [exampleBlastDir + fn for fn in ("Blast0_3.txt Blast3_0.txt Blast1_3.txt Blast3_1.txt Blast2_3.txt Blast3_2.txt Blast3_3.txt Species3.fa \ Blast0_4.txt Blast4_0.txt Blast1_4.txt Blast4_1.txt Blast2_4.txt Blast4_2.txt Blast3_4.txt Blast4_3.txt Blast4_4.txt Species4.fa \ - OrthologousGroups.csv OrthologousGroups.txt OrthologousGroups_UnassignedGenes.csv \ - Statistics_PerSpecies.csv Statistics_Overall.csv OrthologousGroups_SpeciesOverlaps.csv \ + Orthogroups.csv Orthogroups.txt Orthogroups_UnassignedGenes.csv \ + Statistics_PerSpecies.csv Statistics_Overall.csv Orthogroups_SpeciesOverlaps.csv \ clusters_OrthoFinder_v%s_I1.5.txt clusters_OrthoFinder_v%s_I1.5.txt_id_pairs.txt OrthoFinder_v%s_graph.txt" % (version, version, version)).split()] expectedChangedFiles = [exampleBlastDir + fn for fn in "SpeciesIDs.txt SequenceIDs.txt".split()] goldDir = baseDir + "ExpectedOutput/AddTwoSpecies/" @@ -399,8 +399,8 @@ class TestCommandLine(unittest.TestCase): def RemoveSpeciesTest(self, inputDir, goldDir): """Working directory and results directory with correct files in""" - requiredResults = [inputDir + fn for fn in "OrthologousGroups.csv OrthologousGroups_UnassignedGenes.csv OrthologousGroups.txt".split()] - expectedExtraFiles = [inputDir + fn for fn in ("Statistics_PerSpecies.csv Statistics_Overall.csv OrthologousGroups_SpeciesOverlaps.csv clusters_OrthoFinder_v%s_I1.5.txt clusters_OrthoFinder_v%s_I1.5.txt_id_pairs.txt OrthoFinder_v%s_graph.txt" % (version, version, version)).split()] + requiredResults = [inputDir + fn for fn in "Orthogroups.csv Orthogroups_UnassignedGenes.csv Orthogroups.txt".split()] + expectedExtraFiles = [inputDir + fn for fn in ("Statistics_PerSpecies.csv Statistics_Overall.csv Orthogroups_SpeciesOverlaps.csv clusters_OrthoFinder_v%s_I1.5.txt clusters_OrthoFinder_v%s_I1.5.txt_id_pairs.txt OrthoFinder_v%s_graph.txt" % (version, version, version)).split()] with CleanUp(expectedExtraFiles + requiredResults, []): self.stdout, self.stderr = self.RunOrthoFinder("-b %s" % inputDir) for fn in requiredResults: @@ -414,8 +414,8 @@ class TestCommandLine(unittest.TestCase): def test_removeOneAddOne(self): inputDir = baseDir + "Input/ExampleDataset_addOneRemoveOne/Results_Jan28/WorkingDirectory/" expectedExtraFiles = [inputDir + fn for fn in ("Blast0_3.txt Blast3_0.txt Blast1_3.txt Blast3_1.txt Blast2_3.txt Blast3_2.txt Blast3_3.txt Species3.fa \ - OrthologousGroups.csv OrthologousGroups.txt OrthologousGroups_UnassignedGenes.csv \ - Statistics_PerSpecies.csv Statistics_Overall.csv OrthologousGroups_SpeciesOverlaps.csv \ + Orthogroups.csv Orthogroups.txt Orthogroups_UnassignedGenes.csv \ + Statistics_PerSpecies.csv Statistics_Overall.csv Orthogroups_SpeciesOverlaps.csv \ clusters_OrthoFinder_v%s_I1.5.txt clusters_OrthoFinder_v%s_I1.5.txt_id_pairs.txt OrthoFinder_v%s_graph.txt" % (version, version, version)).split()] expectedChangedFiles = [inputDir + fn for fn in "SpeciesIDs.txt SequenceIDs.txt".split()] goldDir = baseDir + "ExpectedOutput/AddOneRemoveOne/" @@ -426,7 +426,7 @@ class TestCommandLine(unittest.TestCase): for fn in expectedExtraFiles: os.path.split(fn)[1] self.assertTrue(os.path.exists(fn), msg=fn) - if "OrthologousGroups" in os.path.split(fn)[1]: + if "Orthogroups" in os.path.split(fn)[1]: self.CompareFile(goldDir + os.path.split(fn)[1], fn) self.CompareFile(goldDir + "SpeciesIDs.txt", inputDir + "SpeciesIDs.txt") self.CompareFile(goldDir + "SequenceIDs.txt", inputDir + "SequenceIDs.txt") @@ -568,7 +568,7 @@ class TestCommandLine(unittest.TestCase): def tearDown(self): self.CleanCurrentResultsDir() if not self.test_passed: - print(self.stdout) +# print(self.stdout) print(self.stderr) def RunOrthoFinder(self, commands): @@ -601,9 +601,9 @@ class TestCommandLine(unittest.TestCase): # Results - orthogroups correct resultsDir = os.path.split(expectedCSVFileLocation)[0] + "/" - self.CompareFile(goldResultsDir + "OrthologousGroups.csv", resultsDir + "OrthologousGroups.csv") - self.CompareFile(goldResultsDir + "OrthologousGroups_UnassignedGenes.csv", resultsDir + "OrthologousGroups_UnassignedGenes.csv") - self.CompareFile(goldResultsDir + "OrthologousGroups.txt", resultsDir + "OrthologousGroups.txt") + self.CompareFile(goldResultsDir + "Orthogroups.csv", resultsDir + "Orthogroups.csv") + self.CompareFile(goldResultsDir + "Orthogroups_UnassignedGenes.csv", resultsDir + "Orthogroups_UnassignedGenes.csv") + self.CompareFile(goldResultsDir + "Orthogroups.txt", resultsDir + "Orthogroups.txt") def CleanCurrentResultsDir(self): if self.currentResultsDir == None: return diff --git a/orthofinder/ExampleDataset/Mycoplasma_agalactiae_5632_FP671138.faa b/orthofinder/ExampleDataset/Mycoplasma_agalactiae.faa similarity index 100% rename from orthofinder/ExampleDataset/Mycoplasma_agalactiae_5632_FP671138.faa rename to orthofinder/ExampleDataset/Mycoplasma_agalactiae.faa diff --git a/orthofinder/ExampleDataset/Mycoplasma_gallisepticum_uid409_AE015450.faa b/orthofinder/ExampleDataset/Mycoplasma_gallisepticum.faa similarity index 100% rename from orthofinder/ExampleDataset/Mycoplasma_gallisepticum_uid409_AE015450.faa rename to orthofinder/ExampleDataset/Mycoplasma_gallisepticum.faa diff --git a/orthofinder/ExampleDataset/Mycoplasma_genitalium_uid97_L43967.faa b/orthofinder/ExampleDataset/Mycoplasma_genitalium.faa similarity index 100% rename from orthofinder/ExampleDataset/Mycoplasma_genitalium_uid97_L43967.faa rename to orthofinder/ExampleDataset/Mycoplasma_genitalium.faa diff --git a/orthofinder/ExampleDataset/Mycoplasma_hyopneumoniae_AE017243.faa b/orthofinder/ExampleDataset/Mycoplasma_hyopneumoniae.faa similarity index 100% rename from orthofinder/ExampleDataset/Mycoplasma_hyopneumoniae_AE017243.faa rename to orthofinder/ExampleDataset/Mycoplasma_hyopneumoniae.faa diff --git a/orthofinder/orthofinder.py b/orthofinder/orthofinder.py index 1aab96e..9741b72 100755 --- a/orthofinder/orthofinder.py +++ b/orthofinder/orthofinder.py @@ -153,7 +153,7 @@ class MCL: with open(orthoxmlFilename, 'wb') as orthoxmlFile: # ET.ElementTree(root).write(orthoxmlFile) orthoxmlFile.write(MCL.prettify(root)) - print("Orthologous groups have been written to orthoxml file:\n %s" % orthoxmlFilename) + print("Orthogroups have been written to orthoxml file:\n %s" % orthoxmlFilename) @staticmethod def RunMCL(graphFilename, clustersFilename, nProcesses, inflation): @@ -232,8 +232,7 @@ class MCL: for iSpecies in xrange(nSpecies): row.append(", ".join(sorted(ogDict[iSpecies]))) thisOutputWriter.writerow(row) - resultsFilesString = "Orthologous groups have been written to tab-delimited files:\n %s\n %s\n" % (outputFilename, singleGeneFilename) - resultsFilesString += "And in OrthoMCL format:\n %s" % (outputFilename[:-3] + "txt") + resultsFilesString = "Orthogroups have been written to tab-delimited files:\n %s\n %s (OrthoMCL format)\n %s" % (outputFilename, outputFilename[:-3] + "txt", singleGeneFilename) return resultsFilesString """ @@ -444,7 +443,6 @@ class WaterfallMethod: def ProcessBlastHits(seqsInfo, fileInfo, Lengths, iSpecies): with warnings.catch_warnings(): warnings.simplefilter("ignore") - util.PrintTime("Starting species %d" % iSpecies) # process up to the best hits for each species Bi = [] for jSpecies in xrange(seqsInfo.nSpecies): @@ -536,7 +534,7 @@ class WaterfallMethod: Stats ------------------------------------------------------------------------------- """ -def OrthologousGroupsMatrix(iSpecies, properOGs): +def OrthogroupsMatrix(iSpecies, properOGs): speciesIndexDict = {iSp:iCol for iCol, iSp in enumerate(iSpecies)} nSpecies = len(iSpecies) nGroups = len(properOGs) @@ -612,7 +610,7 @@ def Stats(ogs, speciesNamesDict, iSpecies, resultsDir, iResultsVersion): allGenes = [g for og in allOgs for g in og] filename_sp = resultsDir + "Statistics_PerSpecies" + ("" if iResultsVersion == 0 else "_%d" % iResultsVersion) + ".csv" filename_sum = resultsDir + "Statistics_Overall" + ("" if iResultsVersion == 0 else "_%d" % iResultsVersion) + ".csv" - filename_overlap = resultsDir + "OrthologousGroups_SpeciesOverlaps" + ("" if iResultsVersion == 0 else "_%d" % iResultsVersion) + ".csv" + filename_overlap = resultsDir + "Orthogroups_SpeciesOverlaps" + ("" if iResultsVersion == 0 else "_%d" % iResultsVersion) + ".csv" percentFormat = "%0.1f" with open(filename_sp, 'wb') as outfile_species, open(filename_sum, 'wb') as outfile_sum: writer_sp = csv.writer(outfile_species, delimiter="\t") @@ -677,7 +675,7 @@ def Stats(ogs, speciesNamesDict, iSpecies, resultsDir, iResultsVersion): writer_sum.writerow(["O50 (all genes)", O50]) # Single-copy orthogroups - ogMatrix = OrthologousGroupsMatrix(iSpecies, properOGs) + ogMatrix = OrthogroupsMatrix(iSpecies, properOGs) nSpecies = len(iSpecies) nPresent = (ogMatrix > np.zeros((1, nSpecies))).sum(1) nCompleteOGs = list(nPresent).count(nSpecies) @@ -688,8 +686,8 @@ def Stats(ogs, speciesNamesDict, iSpecies, resultsDir, iResultsVersion): # Results filenames writer_sum.writerow(["Date", str(datetime.datetime.now()).split()[0]]) - writer_sum.writerow(["Orthogroups file", "OrthologousGroups" + ("" if iResultsVersion == 0 else "_%d" % iResultsVersion) + ".csv"]) - writer_sum.writerow(["Unassigned genes file", "OrthologousGroups" + ("" if iResultsVersion == 0 else "_%d" % iResultsVersion) + "_UnassignedGenes.csv"]) + writer_sum.writerow(["Orthogroups file", "Orthogroups" + ("" if iResultsVersion == 0 else "_%d" % iResultsVersion) + ".csv"]) + writer_sum.writerow(["Unassigned genes file", "Orthogroups" + ("" if iResultsVersion == 0 else "_%d" % iResultsVersion) + "_UnassignedGenes.csv"]) writer_sum.writerow(["Per-species statistics", os.path.split(filename_sp)[1]]) writer_sum.writerow(["Overall statistics", os.path.split(filename_sum)[1]]) writer_sum.writerow(["Orthogroups shared between species", os.path.split(filename_overlap)[1]]) @@ -699,7 +697,7 @@ def Stats(ogs, speciesNamesDict, iSpecies, resultsDir, iResultsVersion): Stats_SpeciesOverlaps(filename_overlap, speciesNamesDict, iSpecies, speciesPresence) statsFiles = "Orthogroup statistics:\n" - statsFiles += " " + " ".join([os.path.split(fn)[1] for fn in [filename_sp, filename_sum, filename_overlap]]) + "\n" + statsFiles += " " + " ".join([os.path.split(fn)[1] for fn in [filename_sp, filename_sum, filename_overlap]]) summaryText = """OrthoFinder assigned %d genes (%0.1f%% of total) to %d orthogroups. Fifty percent of all genes were in orthogroups with %d or more genes (G50 was %d) and were contained in the largest %d orthogroups (O50 was %d). There were %d orthogroups with all species present and %d of these consisted entirely of single-copy genes.""" % (nAssigned, pAssigned, nOgs, G50, G50, O50, O50, nCompleteOGs, nSingleCopy) @@ -1011,10 +1009,8 @@ if __name__ == "__main__": if resultsDir == None: resultsDir = util.CreateNewWorkingDirectory(fastaDir + "Results_") workingDir = resultsDir + "WorkingDirectory" + os.sep os.mkdir(workingDir) - if qUsePrecalculatedBlast: - print("%d thread(s) for BLAST searches" % nBlast) - if not qOnlyPrepare: - print("%d thread(s) for OrthoFinder algorithm" % nProcessAlg) + print("%d thread(s) for highly parallel tasks (BLAST searches etc.)" % nBlast) + print("%d thread(s) for OrthoFinder algorithm" % nProcessAlg) # check for BLAST+ and MCL - else instruct how to install and add to path print("\n1. Checking required programs are installed") @@ -1032,7 +1028,6 @@ if __name__ == "__main__": else: newFastaFiles, userFastaFilenames, idsFilename, speciesIdsFilename, newSpeciesIDs, previousSpeciesIDs = AssignIDsToSequences(fastaDir, workingDir) speciesToUse = speciesToUse + newSpeciesIDs - print("Done!") seqsInfo = util.GetSeqsInfo(workingDir_previous if qUsePrecalculatedBlast else workingDir, speciesToUse) if qXML: @@ -1093,7 +1088,6 @@ if __name__ == "__main__": command = ["makeblastdb", "-dbtype", "prot", "-in", workingDir + "Species%d.fa" % iSp, "-out", workingDir + "BlastDBSpecies%d" % iSp] util.PrintTime("Creating Blast database %d of %d" % (iSp + 1, nDB)) RunBlastDBCommand(command) - print("Done!") if qOnlyPrepare: print("\n4. BLAST commands that must be run") @@ -1111,7 +1105,7 @@ if __name__ == "__main__": for command in commands: print(" ".join(command)) sys.exit() - print("Maximum number of BLAST processes: %d" % nBlast) + print("Using %d thread(s)" % nBlast) util.PrintTime("This may take some time....") cmd_queue = mp.Queue() for iCmd, cmd in enumerate(commands): @@ -1169,12 +1163,12 @@ if __name__ == "__main__": clustersFilename_pairs = clustersFilename + "_id_pairs.txt" MCLread.ConvertSingleIDsToIDPair(seqsInfo, clustersFilename, clustersFilename_pairs) - print("\n6. Creating files for Orthologous Groups") - print( "----------------------------------------") + print("\n6. Writing orthogroups to file") + print( "------------------------------") if not qOrthologues: util.PrintCitation() ogs = MCLread.GetPredictedOGs(clustersFilename_pairs) - resultsBaseFilename = util.GetUnusedFilename(resultsDir + "OrthologousGroups", ".csv")[:-4] # remove .csv from base filename - resultsBaseFilename = resultsDir + "OrthologousGroups" + ("" if iResultsVersion == 0 else "_%d" % iResultsVersion) + resultsBaseFilename = util.GetUnusedFilename(resultsDir + "Orthogroups", ".csv")[:-4] # remove .csv from base filename + resultsBaseFilename = resultsDir + "Orthogroups" + ("" if iResultsVersion == 0 else "_%d" % iResultsVersion) idsDict = MCL.WriteOrthogroupFiles(ogs, [idsFilename], resultsBaseFilename, clustersFilename_pairs) speciesNamesDict = SpeciesNameDict(speciesIdsFilename) orthogroupsResultsFilesString = MCL.CreateOrthogroupTable(ogs, idsDict, speciesNamesDict, speciesToUse, resultsBaseFilename) @@ -1192,6 +1186,7 @@ if __name__ == "__main__": orthologuesResultsFilesString = get_orthologues.GetOrthologues(workingDir, resultsDir, clustersFilename_pairs, nBlast) print(orthogroupsResultsFilesString) print(orthologuesResultsFilesString.rstrip()) + print("") print(statsFile) print("") print(summaryText) diff --git a/orthofinder/scripts/get_orthologues.py b/orthofinder/scripts/get_orthologues.py index 478c25e..310033f 100755 --- a/orthofinder/scripts/get_orthologues.py +++ b/orthofinder/scripts/get_orthologues.py @@ -28,6 +28,7 @@ import os import sys import glob +import shutil import subprocess import numpy as np from collections import Counter, defaultdict @@ -222,7 +223,7 @@ class DendroBLASTTrees(object): self.ogSet = ogSet self.nProcesses = nProcesses self.species = sorted(map(int, self.ogSet.SpeciesDict().keys())) - treesDir = outD + "Trees/" + treesDir = outD + "Gene_Trees/" self.workingDir = outD + "WorkingDirectory/" treesIDsDir = self.workingDir + "Trees_ids/" distancesDir = self.workingDir + "Distances/" @@ -280,8 +281,8 @@ class DendroBLASTTrees(object): m[i, j] = 0.5*max(B[gi.iSeq, gj.iSeq], mins[gi.iSeq]) / maxes[gi.iSeq] return ogs, ogMatrices - def DeleteBlastMatrices(self, workingDir): - for f in glob.glob(workingDir + "B*_*.pic"): + def DeleteBlastMatrices(self): + for f in glob.glob(self.ogSet.fileInfo.outputDir + "Bit*_*.pic"): if os.path.exists(f): os.remove(f) def WriteOGMatrices(self, ogs, ogMatrices): @@ -364,7 +365,7 @@ class DendroBLASTTrees(object): t.write(outfile = newTreeFilename, format=4) except: pass - + def RunAnalysis(self): ogs, ogMatrices_partial = self.GetOGMatrices() ogMatrices = self.WriteOGMatrices(ogs, ogMatrices_partial) @@ -517,11 +518,11 @@ def PrintHelp(): def GetResultsFilesString(rootedSpeciesTreeFN): st = "" baseResultsDir = os.path.abspath(os.path.split(rootedSpeciesTreeFN[0])[0] + "./../Trees/") - st += "Gene trees:\n %s\n" % baseResultsDir + st += "\nGene trees:\n %s\n" % baseResultsDir if len(rootedSpeciesTreeFN) == 1: resultsDir = os.path.split(rootedSpeciesTreeFN[0])[0] - st += "Rooted species tree:\n %s\n" % rootedSpeciesTreeFN[0] - st += "Species-by-species orthologues:\n %s\n" % resultsDir + st += "\nRooted species tree:\n %s\n" % rootedSpeciesTreeFN[0] + st += "\nSpecies-by-species orthologues:\n %s\n" % resultsDir else: st += "\nMultiple potential outgroups were identified for the species tree. Each case has been analysed separately.\n" st+= "Please review the rooted species trees and use the results corresponding to the correct one.\n\n" @@ -532,6 +533,17 @@ def GetResultsFilesString(rootedSpeciesTreeFN): return st +def CleanWorkingDir(dendroBlast): + dendroBlast.DeleteBlastMatrices() + dirs = ['Distances/', "matrices_orthologues/", "Trees_ids_arbitraryRoot/", "SpeciesTree_unrooted.txt"] + for d in dirs: + dFull = dendroBlast.workingDir + d + if os.path.exists(dFull): + if dFull[-1] == "/": + shutil.rmtree(dFull) + else: + os.remove(dFull) + def GetOrthologues(orthofinderWorkingDir, orthofinderResultsDir, clustersFilename_pairs, nProcesses): ogSet = OrthoGroupsSet(orthofinderWorkingDir, clustersFilename_pairs, idExtractor = util.FirstWordExtractor) if len(ogSet.speciesToUse) < 4: @@ -545,8 +557,8 @@ def GetOrthologues(orthofinderWorkingDir, orthofinderResultsDir, clustersFilenam sys.exit() - print("\n2. Reading sequence similarity scores") - print( "-------------------------------------") + print("\n2. Calculating gene distances") + print( "-----------------------------") resultsDir = util.CreateNewWorkingDirectory(orthofinderResultsDir + "Orthologues_") db = DendroBLASTTrees(ogSet, resultsDir, nProcesses) @@ -570,16 +582,17 @@ def GetOrthologues(orthofinderWorkingDir, orthofinderResultsDir, clustersFilenam resultsSpeciesTrees = [] for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)): if qMultiple: - resultsDir_new = resultsDir + "Orthologues_for_potential_outgroup_%d/" % i + resultsDir_new = resultsDir + "Orthologues_using_outgroup_%d/" % i + resultsSpeciesTrees.append(resultsDir_new + "SpeciesTree_rooted_at_outgroup_%d.txt" % i) else: resultsDir_new = resultsDir + "Orthologues/" + resultsSpeciesTrees.append(resultsDir + "SpeciesTree_rooted.txt") os.mkdir(resultsDir_new) - resultsSpeciesTrees.append(resultsDir_new + "SpeciesTree_rooted.txt") db.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qFixNegatives=True) print("\n5%s. Reconciling gene and species trees" % ("-%d"%i if qMultiple else "")) print( "-------------------------------------" + ("--" if qMultiple else "")) - print("Root: " + (", ".join([spDict[s] for s in r]))) + print("Outgroup: " + (", ".join([spDict[s] for s in r]))) dlcparResultsDir = RunDlcpar(db.treesPatIDs, ogSet, nOGs, speciesTree_fn, db.workingDir) # Orthologue lists @@ -587,11 +600,12 @@ def GetOrthologues(orthofinderWorkingDir, orthofinderResultsDir, clustersFilenam print( "----------------------------------------" + ("--" if qMultiple else "")) pt.get_orthologue_lists(ogSet, resultsDir_new, dlcparResultsDir, db.workingDir) + CleanWorkingDir(db) print("\n7. Writing results files") print( "------------------------") return GetResultsFilesString(resultsSpeciesTrees) - + if __name__ == "__main__": if len(sys.argv) < 2 or sys.argv[1] == "--help" or sys.argv[1] == "-h": PrintHelp() diff --git a/orthofinder/scripts/util.py b/orthofinder/scripts/util.py index cd8a1f3..e800156 100644 --- a/orthofinder/scripts/util.py +++ b/orthofinder/scripts/util.py @@ -46,7 +46,7 @@ SequencesInfo = namedtuple("SequencesInfo", "nSeqs nSpecies speciesToUse seqStar FileInfo = namedtuple("FileInfo", "inputDir outputDir graphFilename") picProtocol = 1 -version = "1.0.1" +version = "1.0.2" def PrintNoNewLine(text): sys.stdout.write(text) @@ -354,17 +354,19 @@ def GetOGsFile(userArg): else: # identify orthogroups file clustersFiles = glob.glob(orthofinderWorkingDir + "clusters_OrthoFinder_*.txt_id_pairs.txt") - orthogroupFiles = glob.glob(orthofinderWorkingDir + "OrthologousGroups*.txt") + orthogroupFiles = glob.glob(orthofinderWorkingDir + "OrthologousGroups*.txt") + glob.glob(orthofinderWorkingDir + "Orthogroups*.txt") if orthofinderWorkingDir != userArg: orthogroupFiles += glob.glob(userArg + "OrthologousGroups*.txt") + orthogroupFiles += glob.glob(userArg + "Orthogroups*.txt") # User may have specified a WorkingDirectory and results could be in directory above if len(orthogroupFiles) < len(clustersFiles): orthogroupFiles += glob.glob(userArg + ".." + os.sep + "OrthologousGroups*.txt") + orthogroupFiles += glob.glob(userArg + ".." + os.sep + "Orthogroups*.txt") clustersFiles = sorted(clustersFiles) orthogroupFiles = sorted(orthogroupFiles) if len(clustersFiles) > 1 or len(orthogroupFiles) > 1: print("ERROR: Results from multiple OrthoFinder runs found\n") - print("Tab-delimiter OrthologousGroups*.txt files:") + print("Tab-delimiter Orthogroups*.txt/OrthologousGroups*.txt files:") for fn in orthogroupFiles: print(" " + fn) print("With corresponding cluster files:") @@ -375,7 +377,7 @@ def GetOGsFile(userArg): if len(clustersFiles) != 1 or len(orthogroupFiles) != 1: print("ERROR: Results not found in <orthofinder_results_directory> or <orthofinder_results_directory>/WorkingDirectory") - print("\nCould not find:\n OrthologousGroups*.txt\nor\n clusters_OrthoFinder_*.txt_id_pairs.txt") + print("\nCould not find:\n Orthogroups*.txt/OrthologousGroups*.txt\nor\n clusters_OrthoFinder_*.txt_id_pairs.txt") Fail() print("Generating trees for orthogroups in file:\n %s" % orthogroupFiles[0]) -- GitLab