diff --git a/Tests/test_orthofinder.py b/Tests/test_orthofinder.py index bdc96f77fb4563b10759adc3bd1fea26c905f920..a2b9b27179773329e4a70a8ec6790ce9576680a7 100755 --- a/Tests/test_orthofinder.py +++ b/Tests/test_orthofinder.py @@ -9,6 +9,7 @@ Created on Wed Dec 16 11:25:10 2015 import os import sys import time +import types import datetime import unittest import subprocess @@ -22,17 +23,17 @@ __skipLongTests__ = False baseDir = os.path.dirname(os.path.realpath(__file__)) + os.sep qBinary = False -orthofinder = baseDir + "../orthofinder.py" -orthofinder_bin = baseDir + "../bin/orthofinder" -trees_for_orthogroups = baseDir + "../trees_for_orthogroups.py" -trees_for_orthogroups_bin = baseDir + "../bin/trees_for_orthogroups" +orthofinder = baseDir + "../orthofinder/orthofinder.py" +orthofinder_bin = baseDir + "../orthofinder/bin/orthofinder" +trees_for_orthogroups = baseDir + "../orthofinder/trees_from_MSA.py" +trees_for_orthogroups_bin = baseDir + "../orthofinder/bin/trees_from_MSA" exampleFastaDir = baseDir + "Input/SmallExampleDataset/" exampleBlastDir = baseDir + "Input/SmallExampleDataset_ExampleBlastDir/" goldResultsDir_smallExample = baseDir + "ExpectedOutput/SmallExampleDataset/" goldPrepareBlastDir = baseDir + "ExpectedOutput/SmallExampleDataset_PreparedForBlast/" -version = "1.0.0" +version = "1.0.1" requiredBlastVersion = "2.2.28+" citation = """When publishing work that uses OrthoFinder please cite: @@ -92,6 +93,9 @@ Arguments limiting factor above about 5-10 threads and additional threads may have little effect other than increase RAM requirements. [Default is 1] +-g, --groups + Only infer orthogroups, do not infer gene trees of orthologues. + -I inflation_parameter, --inflation inflation_parameter Specify a non-default inflation parameter for MCL. 
[Default is 1.5] @@ -126,6 +130,7 @@ class CleanUp(object): self.newFiles = newFiles self.modifiedFiles = modifiedFiles self.copies = [] + assert(types.ListType == type(newDirs)) # if it were a string the code could attempt to delete every file on computer self.newDirs = newDirs self.qSaveFiles = qSaveFiles def __enter__(self): @@ -176,27 +181,30 @@ class TestCommandLine(unittest.TestCase): currentResultsDir = exampleFastaDir + "Results_%s/" % datetime.date.today().strftime("%b%d") expectedCSVFile = currentResultsDir + "OrthologousGroups.csv" with CleanUp([], [], [currentResultsDir, ]): - stdout, stderr = self.RunOrthoFinder("-f %s" % exampleFastaDir) - self.CheckStandardRun(stdout, stderr, goldResultsDir_smallExample, expectedCSVFile) + self.stdout, self.stderr = self.RunOrthoFinder("-f %s -g" % exampleFastaDir) + self.CheckStandardRun(self.stdout, self.stderr, goldResultsDir_smallExample, expectedCSVFile) + self.test_passed = True def test_fromfasta_threads(self): currentResultsDir = exampleFastaDir + "Results_%s/" % datetime.date.today().strftime("%b%d") expectedCSVFile = currentResultsDir + "OrthologousGroups.csv" with CleanUp([], [], [currentResultsDir, ]): - stdout, stderr = self.RunOrthoFinder("-f %s -t 4 -a 3" % exampleFastaDir) - self.CheckStandardRun(stdout, stderr, goldResultsDir_smallExample, expectedCSVFile) + self.stdout, self.stderr = self.RunOrthoFinder("-f %s -t 4 -a 3 -g" % exampleFastaDir) + self.CheckStandardRun(self.stdout, self.stderr, goldResultsDir_smallExample, expectedCSVFile) + self.test_passed = True @unittest.skipIf(__skipLongTests__, "Only performing quick tests") def test_fromfasta_full(self): currentResultsDir = exampleFastaDir + "Results_%s/" % datetime.date.today().strftime("%b%d") expectedCSVFile = currentResultsDir + "OrthologousGroups.csv" with CleanUp([], [], [currentResultsDir, ]): - stdout, stderr = self.RunOrthoFinder("--fasta %s" % exampleFastaDir) - self.CheckStandardRun(stdout, stderr, goldResultsDir_smallExample, 
expectedCSVFile) + self.stdout, self.stderr = self.RunOrthoFinder("--fasta %s -g" % exampleFastaDir) + self.CheckStandardRun(self.stdout, self.stderr, goldResultsDir_smallExample, expectedCSVFile) + self.test_passed = True def test_justPrepare(self): self.currentResultsDir = exampleFastaDir + "Results_%s/" % datetime.date.today().strftime("%b%d") - stdout, stderr = self.RunOrthoFinder("-f %s -p" % exampleFastaDir) + self.stdout, self.stderr = self.RunOrthoFinder("-f %s -p" % exampleFastaDir) expectedFiles = ['BlastDBSpecies0.phr', 'BlastDBSpecies0.pin', 'BlastDBSpecies0.psq', 'BlastDBSpecies1.phr', 'BlastDBSpecies1.pin', 'BlastDBSpecies1.psq', 'BlastDBSpecies2.phr', 'BlastDBSpecies2.pin', 'BlastDBSpecies2.psq', @@ -213,10 +221,11 @@ class TestCommandLine(unittest.TestCase): # no other files in directory or intermediate directory self.assertTrue(len(list(glob.glob(self.currentResultsDir + "WorkingDirectory/*"))), len(expectedFiles)) self.assertTrue(len(list(glob.glob(self.currentResultsDir + "*"))), 1) # Only 'WorkingDirectory/" + self.test_passed = True def test_justPrepareFull(self): self.currentResultsDir = exampleFastaDir + "Results_%s/" % datetime.date.today().strftime("%b%d") - stdout, stderr = self.RunOrthoFinder("-f %s --prepare" % exampleFastaDir) + self.stdout, self.stderr = self.RunOrthoFinder("-f %s --prepare" % exampleFastaDir) expectedFiles = ['BlastDBSpecies0.phr', 'BlastDBSpecies0.pin', 'BlastDBSpecies0.psq', 'BlastDBSpecies1.phr', 'BlastDBSpecies1.pin', 'BlastDBSpecies1.psq', 'BlastDBSpecies2.phr', 'BlastDBSpecies2.pin', 'BlastDBSpecies2.psq', @@ -232,48 +241,56 @@ class TestCommandLine(unittest.TestCase): # no other files in directory or intermediate directory self.assertTrue(len(list(glob.glob(self.currentResultsDir + "WorkingDirectory/*"))), len(expectedFiles)) - self.assertTrue(len(list(glob.glob(self.currentResultsDir + "*"))), 1) # Only 'WorkingDirectory/" + self.assertTrue(len(list(glob.glob(self.currentResultsDir + "*"))), 1) # Only 
'WorkingDirectory/" + self.test_passed = True def test_fromblast(self): expectedCSVFile = exampleBlastDir + "OrthologousGroups.csv" newFiles = ("OrthologousGroups.csv OrthologousGroups_UnassignedGenes.csv OrthologousGroups.txt clusters_OrthoFinder_v%s_I1.5.txt_id_pairs.txt clusters_OrthoFinder_v%s_I1.5.txt OrthoFinder_v%s_graph.txt Statistics_PerSpecies.csv Statistics_Overall.csv OrthologousGroups_SpeciesOverlaps.csv" % (version, version, version)).split() newFiles = [exampleBlastDir + fn for fn in newFiles] with CleanUp(newFiles, []): - stdout, stderr = self.RunOrthoFinder("-b %s" % exampleBlastDir) - self.CheckStandardRun(stdout, stderr, goldResultsDir_smallExample, expectedCSVFile) + self.stdout, self.stderr = self.RunOrthoFinder("-b %s -g" % exampleBlastDir) + self.CheckStandardRun(self.stdout, self.stderr, goldResultsDir_smallExample, expectedCSVFile) + self.test_passed = True def test_fromblast_full(self): expectedCSVFile = exampleBlastDir + "OrthologousGroups.csv" newFiles = ("OrthologousGroups.csv OrthologousGroups_UnassignedGenes.csv OrthologousGroups.txt clusters_OrthoFinder_v%s_I1.5.txt_id_pairs.txt clusters_OrthoFinder_v%s_I1.5.txt OrthoFinder_v%s_graph.txt Statistics_PerSpecies.csv Statistics_Overall.csv OrthologousGroups_SpeciesOverlaps.csv" % (version, version, version)).split() newFiles = [exampleBlastDir + fn for fn in newFiles] with CleanUp(newFiles, []): - stdout, stderr = self.RunOrthoFinder("--blast %s" % exampleBlastDir) - self.CheckStandardRun(stdout, stderr, goldResultsDir_smallExample, expectedCSVFile) + self.stdout, self.stderr = self.RunOrthoFinder("--blast %s -g" % exampleBlastDir) + self.CheckStandardRun(self.stdout, self.stderr, goldResultsDir_smallExample, expectedCSVFile) + self.test_passed = True def test_fromblast_algthreads(self): expectedCSVFile = exampleBlastDir + "OrthologousGroups.csv" newFiles = ("Statistics_PerSpecies.csv Statistics_Overall.csv OrthologousGroups_SpeciesOverlaps.csv OrthologousGroups.csv 
OrthologousGroups_UnassignedGenes.csv OrthologousGroups.txt clusters_OrthoFinder_v%s_I1.5.txt_id_pairs.txt clusters_OrthoFinder_v%s_I1.5.txt OrthoFinder_v%s_graph.txt" % (version, version, version)).split() newFiles = [exampleBlastDir + fn for fn in newFiles] with CleanUp(newFiles, []): - stdout, stderr = self.RunOrthoFinder("-b %s -a 3" % exampleBlastDir) - self.CheckStandardRun(stdout, stderr, goldResultsDir_smallExample, expectedCSVFile) + self.stdout, self.stderr = self.RunOrthoFinder("-b %s -a 3 -g" % exampleBlastDir) + self.CheckStandardRun(self.stdout, self.stderr, goldResultsDir_smallExample, expectedCSVFile) + self.test_passed = True def test_blast_results_error(self): - with CleanUp([], []): - stdout, stderr = self.RunOrthoFinder("-a 2 -b " + baseDir + "Input/SmallExampleDataset_ExampleBadBlast/") - self.assertTrue("Traceback" not in stderr) - self.assertTrue("Offending line was:" in stderr) - self.assertTrue("0_0 0_0 100.00 466" in stderr) - self.assertTrue("Connected putatitive homologs" not in stdout) - self.assertTrue("ERROR: An error occurred, please review previous error messages for more information." in stdout) + d = baseDir + "Input/SmallExampleDataset_ExampleBadBlast/" + newFiles = [d + "%s%d_%d.pic" % (s, i,j) for i in xrange(1, 3) for j in xrange(3) for s in ["B", "BH"]] + with CleanUp(newFiles, []): + self.stdout, self.stderr = self.RunOrthoFinder("-a 2 -g -b " + d) + self.assertTrue("Traceback" not in self.stderr) + self.assertTrue("Offending line was:" in self.stderr) + self.assertTrue("0_0 0_0 100.00 466" in self.stderr) + self.assertTrue("Connected putatitive homologs" not in self.stdout) + self.assertTrue("ERROR: An error occurred, please review previous error messages for more information." 
in self.stdout) + self.test_passed = True def test_inflation(self): expectedCSVFile = exampleBlastDir + "OrthologousGroups.csv" newFiles = ("Statistics_PerSpecies.csv Statistics_Overall.csv OrthologousGroups_SpeciesOverlaps.csv OrthologousGroups.csv OrthologousGroups_UnassignedGenes.csv OrthologousGroups.txt clusters_OrthoFinder_v%s_I1.8.txt_id_pairs.txt clusters_OrthoFinder_v%s_I1.8.txt OrthoFinder_v%s_graph.txt" % (version, version, version)).split() newFiles = [exampleBlastDir + fn for fn in newFiles] with CleanUp(newFiles, []): - stdout, stderr = self.RunOrthoFinder("-I 1.8 -b %s" % exampleBlastDir) - self.CheckStandardRun(stdout, stderr, baseDir + "ExpectedOutput/SmallExampleDataset_I1.8/", expectedCSVFile) + self.stdout, self.stderr = self.RunOrthoFinder("-I 1.8 -b %s -g" % exampleBlastDir) + self.CheckStandardRun(self.stdout, self.stderr, baseDir + "ExpectedOutput/SmallExampleDataset_I1.8/", expectedCSVFile) + self.test_passed = True # @unittest.skipIf(__skipLongTests__, "Only performing quick tests") # def test_fromblastOrthobench(self): @@ -282,21 +299,23 @@ class TestCommandLine(unittest.TestCase): # self.currentResultsDir = None # expectedNewFiles = [baseDir + "Input/Orthobench_blast/" + x for x in "OrthoFinder_v0.4.0_graph.txt clusters_OrthoFinder_v0.4.0_I1.5.txt clusters_OrthoFinder_v0.4.0_I1.5.txt_id_pairs.txt".split()] # with CleanUp(expectedNewFiles, []): -# stdout, stderr = self.RunOrthoFinder("-b %sInput/Orthobench_blast" % baseDir) -# self.CheckStandardRun(stdout, stderr, goldResultsDir_orthobench, expectedCSVFileLocation, qDelete = True) +# self.stdout, self.stderr = self.RunOrthoFinder("-b %sInput/Orthobench_blast" % baseDir) +# self.CheckStandardRun(stdout, stderr, goldResultsDir_orthobench, expectedCSVFileLocation, qDelete = True) +# self.test_passed = True def test_help(self): - stdout, stderr = self.RunOrthoFinder("") - self.assertTrue(expectedHelp in stdout) - self.assertEqual(len(stderr), 0) - - stdout, stderr = self.RunOrthoFinder("-h") 
- self.assertTrue(expectedHelp in stdout) - self.assertEqual(len(stderr), 0) + self.stdout, self.stderr = self.RunOrthoFinder("") + self.assertTrue(expectedHelp in self.stdout) + self.assertEqual(len(self.stderr), 0) + + self.stdout, self.stderr = self.RunOrthoFinder("-h") + self.assertTrue(expectedHelp in self.stdout) + self.assertEqual(len(self.stderr), 0) - stdout, stderr = self.RunOrthoFinder("--help") - self.assertTrue(expectedHelp in stdout) - self.assertEqual(len(stderr), 0) + self.stdout, self.stderr = self.RunOrthoFinder("--help") + self.assertTrue(expectedHelp in self.stdout) + self.assertEqual(len(self.stderr), 0) + self.test_passed = True # def test_numberOfThreads(self): # pass @@ -316,8 +335,9 @@ class TestCommandLine(unittest.TestCase): expectedChangedFiles = [exampleBlastDir + fn for fn in "SpeciesIDs.txt SequenceIDs.txt".split()] # cleanup afterwards including failed test goldDir = baseDir + "ExpectedOutput/AddOneSpecies/" - with CleanUp(expectedExtraFiles, expectedChangedFiles): - stdout, stderr = self.RunOrthoFinder("-b %s -f %s" % (exampleBlastDir, baseDir + "Input/ExtraFasta")) + expectedExtraDir = exampleBlastDir + "Orthologues_%s/" % datetime.date.today().strftime("%b%d") + with CleanUp(expectedExtraFiles, expectedChangedFiles, [expectedExtraDir]): + self.stdout, self.stderr = self.RunOrthoFinder("-b %s -f %s" % (exampleBlastDir, baseDir + "Input/ExtraFasta")) # check extra blast files # check extra fasta file: simple couple of checks to ensure all ok for fn in expectedExtraFiles: @@ -325,7 +345,8 @@ class TestCommandLine(unittest.TestCase): self.assertTrue(os.path.exists(fn), msg=fn) # mcl output files contain a variable header, these files are an implementation detail that I don't want to test (I want the final orthogroups to be correct) if "clusters" in os.path.split(fn)[1]: continue - self.CompareFile(goldDir + os.path.split(fn)[1], fn) + self.CompareFile(goldDir + os.path.split(fn)[1], fn) + self.test_passed = True def 
test_addTwoSpecies(self): expectedExtraFiles = [exampleBlastDir + fn for fn in ("Blast0_3.txt Blast3_0.txt Blast1_3.txt Blast3_1.txt Blast2_3.txt Blast3_2.txt Blast3_3.txt Species3.fa \ @@ -336,12 +357,13 @@ class TestCommandLine(unittest.TestCase): expectedChangedFiles = [exampleBlastDir + fn for fn in "SpeciesIDs.txt SequenceIDs.txt".split()] goldDir = baseDir + "ExpectedOutput/AddTwoSpecies/" with CleanUp(expectedExtraFiles, expectedChangedFiles): - stdout, stderr = self.RunOrthoFinder("-b %s -f %s" % (exampleBlastDir, baseDir + "Input/ExtraFasta2")) + self.stdout, self.stderr = self.RunOrthoFinder("-b %s -g -f %s" % (exampleBlastDir, baseDir + "Input/ExtraFasta2")) for fn in expectedExtraFiles: os.path.split(fn)[1] self.assertTrue(os.path.exists(fn), msg=fn) if "clusters" in os.path.split(fn)[1]: continue self.CompareFile(goldDir + os.path.split(fn)[1], fn) + self.test_passed = True def test_addTwoSpecies_blastsRequired(self): expectedExtraFiles = [exampleBlastDir + fn for fn in "Species3.fa Species4.fa".split()] @@ -349,18 +371,19 @@ class TestCommandLine(unittest.TestCase): expectedChangedFiles = [exampleBlastDir + fn for fn in "SpeciesIDs.txt SequenceIDs.txt".split()] # cleanup afterwards including failed test with CleanUp(expectedExtraFiles, expectedChangedFiles): - stdout, stderr = self.RunOrthoFinder("-b %s -f %s -p" % (exampleBlastDir, baseDir + "Input/ExtraFasta2")) + self.stdout, self.stderr = self.RunOrthoFinder("-b %s -f %s -p" % (exampleBlastDir, baseDir + "Input/ExtraFasta2")) original = [0, 1, 2] new = [3, 4] for i in new: for j in original: - assert("Blast%d_%d.txt" % (i,j) in stdout) - assert("Blast%d_%d.txt" % (j,i) in stdout) + assert("Blast%d_%d.txt" % (i,j) in self.stdout) + assert("Blast%d_%d.txt" % (j,i) in self.stdout) for j in new: - assert("Blast%d_%d.txt" % (i,j) in stdout) - assert("Blast%d_%d.txt" % (j,i) in stdout) + assert("Blast%d_%d.txt" % (i,j) in self.stdout) + assert("Blast%d_%d.txt" % (j,i) in self.stdout) - 
assert(stdout.count("blastp") == 2*len(new)*len(original) + len(new)**2) + assert(self.stdout.count("blastp") == 2*len(new)*len(original) + len(new)**2) + self.test_passed = True def test_removeFirstSpecies(self): self.RemoveSpeciesTest(baseDir + "Input/ExampleDataset_removeFirst/", @@ -379,10 +402,11 @@ class TestCommandLine(unittest.TestCase): requiredResults = [inputDir + fn for fn in "OrthologousGroups.csv OrthologousGroups_UnassignedGenes.csv OrthologousGroups.txt".split()] expectedExtraFiles = [inputDir + fn for fn in ("Statistics_PerSpecies.csv Statistics_Overall.csv OrthologousGroups_SpeciesOverlaps.csv clusters_OrthoFinder_v%s_I1.5.txt clusters_OrthoFinder_v%s_I1.5.txt_id_pairs.txt OrthoFinder_v%s_graph.txt" % (version, version, version)).split()] with CleanUp(expectedExtraFiles + requiredResults, []): - stdout, stderr = self.RunOrthoFinder("-b %s" % inputDir) + self.stdout, self.stderr = self.RunOrthoFinder("-b %s" % inputDir) for fn in requiredResults: self.assertTrue(os.path.exists(fn), msg=fn) - self.CompareFile(goldDir + os.path.split(fn)[1], fn) + self.CompareFile(goldDir + os.path.split(fn)[1], fn) + self.test_passed = True # def test_removeMultipleSpecies(self): # pass @@ -396,7 +420,7 @@ class TestCommandLine(unittest.TestCase): expectedChangedFiles = [inputDir + fn for fn in "SpeciesIDs.txt SequenceIDs.txt".split()] goldDir = baseDir + "ExpectedOutput/AddOneRemoveOne/" with CleanUp(expectedExtraFiles, expectedChangedFiles): - stdout, stderr = self.RunOrthoFinder("-b %s -f %s" % (inputDir, baseDir + "Input/ExampleDataset_addOneRemoveOne/ExtraFasta/")) + self.stdout, self.stderr = self.RunOrthoFinder("-b %s -f %s" % (inputDir, baseDir + "Input/ExampleDataset_addOneRemoveOne/ExtraFasta/")) # print(stdout) # print(stderr) for fn in expectedExtraFiles: @@ -405,7 +429,8 @@ class TestCommandLine(unittest.TestCase): if "OrthologousGroups" in os.path.split(fn)[1]: self.CompareFile(goldDir + os.path.split(fn)[1], fn) self.CompareFile(goldDir + 
"SpeciesIDs.txt", inputDir + "SpeciesIDs.txt") - self.CompareFile(goldDir + "SequenceIDs.txt", inputDir + "SequenceIDs.txt") + self.CompareFile(goldDir + "SequenceIDs.txt", inputDir + "SequenceIDs.txt") + self.test_passed = True # def test_addMultipleSpecies_prepare(selg): # pass @@ -434,7 +459,7 @@ class TestCommandLine(unittest.TestCase): goldDirs = [baseDir + "ExpectedOutput/SmallExampleDataset_trees/" + d + "/" for d in expectedDirs] nTrees = 427 with CleanUp([], [], newDirs): - stdout, stderr = self.RunTrees(baseDir + "Input/SmallExampleDataset_forTrees/Results_Jan28/") + self.stdout, self.stderr = self.RunTrees(baseDir + "Input/SmallExampleDataset_forTrees/Results_Jan28/") for i in xrange(nTrees): self.assertTrue(os.path.exists(newDirs[0] + "/OG%07d.fa" % i), msg=str(i)) self.assertTrue(os.path.exists(newDirs[1] + "/OG%07d_tree.txt" % i)) @@ -443,38 +468,42 @@ class TestCommandLine(unittest.TestCase): for goldFN in glob.glob(goldD + "*"): newFN = newD + os.path.split(goldFN)[1] self.assertTrue(os.path.exists(newFN), msg=goldFN) - self.CompareFile(goldFN, newFN) + self.CompareFile(goldFN, newFN) + self.test_passed = True def test_treesAfterOption_b(self): expectedDirs = "Alignments Trees Sequences".split() newDirs = [baseDir + "Input/SmallExampleDataset_forTreesFromBlast/" + d +"/" for d in expectedDirs] goldDirs = [baseDir + "ExpectedOutput/SmallExampleDataset_trees/" + d +"/" for d in expectedDirs] with CleanUp([], [], newDirs): - stdout, stderr = self.RunTrees(baseDir + "Input/SmallExampleDataset_forTreesFromBlast/") + self.stdout, self.stderr = self.RunTrees(baseDir + "Input/SmallExampleDataset_forTreesFromBlast/") for goldD, newD in zip(goldDirs, newDirs): for goldFN in glob.glob(goldD + "*"): newFN = newD + os.path.split(goldFN)[1] self.assertTrue(os.path.exists(newFN), msg=goldFN) if newD[:-1].rsplit("/", 1)[-1] == "Sequences": - self.CompareFile(goldFN, newFN) + self.CompareFile(goldFN, newFN) + self.test_passed = True def 
test_treesMultipleResults(self): expectedDirs = "Alignments Trees Sequences".split() newDirs = [baseDir + "Input/MultipleResults/Results_Jan28/WorkingDirectory/" + d +"/" for d in expectedDirs] goldDirs = [baseDir + "ExpectedOutput/MultipleResultsInDirectory/" + d +"/" for d in expectedDirs] with CleanUp([], [], newDirs): - stdout, stderr = self.RunTrees(baseDir + "Input/MultipleResults/Results_Jan28/WorkingDirectory/clusters_OrthoFinder_v0.4.0_I1.5.txt_id_pairs.txt") + self.stdout, self.stderr = self.RunTrees(baseDir + "Input/MultipleResults/Results_Jan28/WorkingDirectory/clusters_OrthoFinder_v0.4.0_I1.5.txt_id_pairs.txt") for goldD, newD in zip(goldDirs, newDirs): for goldFN in glob.glob(goldD + "*"): newFN = newD + os.path.split(goldFN)[1] self.assertTrue(os.path.exists(newFN), msg=goldFN) if newD[:-1].rsplit("/", 1)[-1] == "Sequences": - self.CompareFile(goldFN, newFN) + self.CompareFile(goldFN, newFN) + self.test_passed = True def test_treesSubset_unspecified(self): - stdout, stderr = self.RunTrees(baseDir + "/Input/Trees_OneSpeciesRemoved") - self.assertTrue("ERROR: Results from multiple OrthoFinder runs found" in stdout) - self.assertTrue("Please run with only one set of results in directories or specifiy the specific clusters_OrthoFinder_*.txt_id_pairs.txt file on the command line" in stdout) + self.stdout, self.stderr = self.RunTrees(baseDir + "/Input/Trees_OneSpeciesRemoved") + self.assertTrue("ERROR: Results from multiple OrthoFinder runs found" in self.stdout) + self.assertTrue("Please run with only one set of results in directories or specifiy the specific clusters_OrthoFinder_*.txt_id_pairs.txt file on the command line" in self.stdout) + self.test_passed = True def test_treesResultsChoice_subset(self): expectedDirs = "Alignments Trees Sequences".split() @@ -482,7 +511,7 @@ class TestCommandLine(unittest.TestCase): goldDirs = [baseDir + "ExpectedOutput/SmallExampleDataset_trees/" + d + "/" for d in expectedDirs] nTrees = 427 with CleanUp([], [], 
newDirs): - stdout, stderr = self.RunTrees(baseDir + "/Input/Trees_OneSpeciesRemoved/clusters_OrthoFinder_v0.6.1_I1.5_1.txt_id_pairs.txt") + self.stdout, self.stderr = self.RunTrees(baseDir + "/Input/Trees_OneSpeciesRemoved/clusters_OrthoFinder_v0.6.1_I1.5_1.txt_id_pairs.txt") for i in xrange(nTrees): self.assertTrue(os.path.exists(newDirs[0] + "/OG%07d.fa" % i), msg=str(i)) self.assertTrue(os.path.exists(newDirs[1] + "/OG%07d_tree.txt" % i)) @@ -490,7 +519,8 @@ class TestCommandLine(unittest.TestCase): newD = newDirs[2] for goldFN in glob.glob(goldD + "*"): newFN = newD + os.path.split(goldFN)[1] - self.assertTrue(os.path.exists(newFN), msg=goldFN) + self.assertTrue(os.path.exists(newFN), msg=goldFN) + self.test_passed = True # def test_treesResultsChoice_full(self): dirs = ["Trees/", "Alignments/", "Sequences/"] @@ -499,22 +529,21 @@ class TestCommandLine(unittest.TestCase): nExpected = [536, 536, 1330] fnPattern = [baseDir + "/Input/Trees_OneSpeciesRemoved/" + p for p in ["Trees/OG%07d_tree.txt", "Alignments/OG%07d.fa", "Sequences/OG%07d.fa"]] with CleanUp([], [], expectedDirs): - stdout, stderr = self.RunTrees(baseDir + "/Input/Trees_OneSpeciesRemoved/clusters_OrthoFinder_v0.6.1_I1.5.txt_id_pairs.txt") + self.stdout, self.stderr = self.RunTrees(baseDir + "/Input/Trees_OneSpeciesRemoved/clusters_OrthoFinder_v0.6.1_I1.5.txt_id_pairs.txt") for goldD, expD, n, fnPat in zip(goldDirs, expectedDirs, nExpected, fnPattern): for i in xrange(n): self.assertTrue(os.path.exists(fnPat % i)) # Only test the contents of a limited number for goldFN in glob.glob(goldD + "*"): newFN = expD + os.path.split(goldFN)[1] - self.CompareFile(goldFN, newFN) + self.CompareFile(goldFN, newFN) + self.test_passed = True def test_ProblemCharacters_issue28(self): - """Deal with problematic characters in accession names - """ # Can't have ( ) : , inputDir = baseDir + "Input/DisallowedCharacters/" with CleanUp([], [], [inputDir + x for x in "Trees/ Sequences/ Alignments/".split()]): - stdout, 
stderr = self.RunTrees(inputDir) + self.stdout, self.stderr = self.RunTrees(inputDir) fn = inputDir + "Trees/OG0000000_tree.txt" self.assertTrue(os.path.exists(fn)) with open(fn, 'rb') as infile: @@ -523,7 +552,8 @@ class TestCommandLine(unittest.TestCase): self.assertTrue("(" in l) self.assertTrue(")" in l) self.assertTrue("," in l) - self.assertTrue(";" in l) + self.assertTrue(";" in l) + self.test_passed = True # def test_treesExtraSpecies(self): @@ -531,9 +561,15 @@ class TestCommandLine(unittest.TestCase): def setUp(self): self.currentResultsDir = None + self.test_passed = False + self.stdout = None + self.stderr = None def tearDown(self): self.CleanCurrentResultsDir() + if not self.test_passed: + print(self.stdout) + print(self.stderr) def RunOrthoFinder(self, commands): if qBinary: @@ -634,16 +670,15 @@ if __name__ == "__main__": if len(sys.argv) > 2: test = sys.argv[2] else: - test = sys.argv[1] - - if test == None: - suite = unittest.TestLoader().loadTestsFromTestCase(TestCommandLine) - unittest.TextTestRunner(verbosity=2).run(suite) - else: - suite = unittest.TestSuite() - suite.addTest(TestCommandLine(test)) - runner = unittest.TextTestRunner() - runner.run(suite) + test = sys.argv[1] + if test == None: + suite = unittest.TestLoader().loadTestsFromTestCase(TestCommandLine) + unittest.TextTestRunner(verbosity=2).run(suite) + else: + suite = unittest.TestSuite() + suite.addTest(TestCommandLine(test)) + runner = unittest.TextTestRunner() + runner.run(suite) diff --git a/ExampleDataset/Mycoplasma_agalactiae_5632_FP671138.faa b/orthofinder/ExampleDataset/Mycoplasma_agalactiae_5632_FP671138.faa similarity index 100% rename from ExampleDataset/Mycoplasma_agalactiae_5632_FP671138.faa rename to orthofinder/ExampleDataset/Mycoplasma_agalactiae_5632_FP671138.faa diff --git a/ExampleDataset/Mycoplasma_gallisepticum_uid409_AE015450.faa b/orthofinder/ExampleDataset/Mycoplasma_gallisepticum_uid409_AE015450.faa similarity index 100% rename from 
ExampleDataset/Mycoplasma_gallisepticum_uid409_AE015450.faa rename to orthofinder/ExampleDataset/Mycoplasma_gallisepticum_uid409_AE015450.faa diff --git a/ExampleDataset/Mycoplasma_genitalium_uid97_L43967.faa b/orthofinder/ExampleDataset/Mycoplasma_genitalium_uid97_L43967.faa similarity index 100% rename from ExampleDataset/Mycoplasma_genitalium_uid97_L43967.faa rename to orthofinder/ExampleDataset/Mycoplasma_genitalium_uid97_L43967.faa diff --git a/ExampleDataset/Mycoplasma_hyopneumoniae_AE017243.faa b/orthofinder/ExampleDataset/Mycoplasma_hyopneumoniae_AE017243.faa similarity index 100% rename from ExampleDataset/Mycoplasma_hyopneumoniae_AE017243.faa rename to orthofinder/ExampleDataset/Mycoplasma_hyopneumoniae_AE017243.faa diff --git a/License.md b/orthofinder/License.md similarity index 100% rename from License.md rename to orthofinder/License.md diff --git a/README.md b/orthofinder/README.md similarity index 95% rename from README.md rename to orthofinder/README.md index 4bdfca734fbe4809367ad7aa69479bd01d0220c8..7479499aa7baa3dd1282e040366aa2d4e832bad1 100644 --- a/README.md +++ b/orthofinder/README.md @@ -1,4 +1,7 @@ # OrthoFinder — Accurate inference of orthogroups, orthologues, gene trees and rooted species tree made easy! + + + What does OrthoFinder do? ========== OrthoFinder is a fast, accurate and comprehensive analysis tool for comparative genomics. It finds orthologs, orthogroups, infers gene trees for all orthogroups and infers a species tree for the species being analysed. OrthoFinder also identifies the root of the species tree and provides lots of useful statistics for comparative genomic analyses. OrthoFinder is very simple to use and all you need to run it is a set of protein sequence files (one per species) in FASTA format . @@ -25,7 +28,7 @@ What's New **Jan. 2016**: Added the ability to **add and remove species**. -**Sept. 
2015**: Added the **trees_for_orthogroups.py** utility to automatically calculate multiple sequence alignments and gene trees for the orthogroups calcualted using OrthoFinder. +**Sept. 2015**: Added the **trees_from_MSA.py** utility to automatically calculate multiple sequence alignments and gene trees for the orthogroups calculated using OrthoFinder. Usage ===== @@ -75,7 +78,7 @@ Most of the terms in the files **Statistics_Overall.csv** and **Statistics_PerSp - Species-specific orthogroup: An orthogroups that consist entirely of genes from one species. - G50: The number of genes in the orthogroup such that 50% of genes are in orthogroups of that size or larger. - O50: The smallest number of orthogroups such that 50% of genes are in orthogroups of that size or larger. -- Single-copy orthogroup: An orthogroup with exactly one gene (and no more) from each species. These orthogroups are ideal for inferring a species tree. Note that trees for all orthogroups can be generated using the trees_for_orthogroups.py script. +- Single-copy orthogroup: An orthogroup with exactly one gene (and no more) from each species. These orthogroups are ideal for inferring a species tree. Note that trees for all orthogroups can be generated using the trees_from_MSA.py script. - Unassigned gene: A gene that has not been put into an orthogroup with any other genes. ###Orthologues, Gene Trees & Rooted Species Tree @@ -97,7 +100,7 @@ After the orthogroups have been calculated OrthoFinder will infer gene trees, th 2. DLCpar-search -To use the trees_for_orthogroups.py utility, which infers trees using multiple sequence alignments, there are two additional dependencies which should be installed and in the system path: +To use the trees_from_MSA.py utility, which infers trees using multiple sequence alignments, there are two additional dependencies which should be installed and in the system path: 1.
MAFFT @@ -296,9 +299,9 @@ Information on the orthoxml format can be found here: http://orthoxml.org/0.3/or Trees for Orthogroups ===================== -The 'trees_for_orthogroups.py' utility will automatically generate multiple sequence alignments and gene trees for each orthologous group generated by OrthoFinder. For example, once OrthoFinder has been run on the example dataset, trees_for_orthogroups can be run using: +The 'trees_from_MSA.py' utility will automatically generate multiple sequence alignments and gene trees for each orthologous group generated by OrthoFinder. For example, once OrthoFinder has been run on the example dataset, trees_from_MSA can be run using: -**python trees_for_orthogroups.py ExampleDataset/Results_\<date\> -t 16** +**python trees_from_MSA.py ExampleDataset/Results_\<date\> -t 16** where "ExampleDataset/Results_\<date\>" is the results directory from running OrthoFinder. diff --git a/orthofinder/Workflow.png b/orthofinder/Workflow.png new file mode 100755 index 0000000000000000000000000000000000000000..59fd36fe4d04ac499def48406b8eb8e8ee9bad30 Binary files /dev/null and b/orthofinder/Workflow.png differ diff --git a/orthofinder/__init__.py b/orthofinder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..120282a02ad8d3d571f675638a0a9ea962579105 --- /dev/null +++ b/orthofinder/__init__.py @@ -0,0 +1 @@ +__all__ = ["orthofinder", "scripts"] diff --git a/orthofinder.py b/orthofinder/orthofinder.py similarity index 71% rename from orthofinder.py rename to orthofinder/orthofinder.py index b4e72a92576d5135d79d1e3b02aeff8f51762955..1aab96e2d09b5786d70293449aff0b2f77c4b47f 100755 --- a/orthofinder.py +++ b/orthofinder/orthofinder.py @@ -25,9 +25,6 @@ # For any enquiries send an email to David Emms # david_emms@hotmail.com -nThreadsDefault = 16 -nAlgDefault = 1 - import sys # Y import subprocess # Y import os # Y @@ -42,30 +39,23 @@ import csv # Y import scipy.sparse as sparse # install import os.path # Y import 
numpy.core.numeric as numeric # install -import cPickle as pic # Y -from collections import defaultdict, namedtuple # Y +from collections import defaultdict # Y import xml.etree.ElementTree as ET # Y from xml.etree.ElementTree import SubElement # Y from xml.dom import minidom # Y import Queue # Y import warnings # Y -import time # Y +import scripts.mcl as MCLread +import scripts.blast_file_processor as BlastFileProcessor +from scripts import util, matrices, get_orthologues -version = "1.0.0" fastaExtensions = {"fa", "faa", "fasta", "fas"} -picProtocol = 1 if sys.platform.startswith("linux"): with open(os.devnull, "w") as f: subprocess.call("taskset -p 0xffffffffffff %d" % os.getpid(), shell=True, stdout=f) # get round problem with python multiprocessing library that can set all cpu affinities to a single cpu - -""" -Utilities -------------------------------------------------------------------------------- -""" -def RunCommand(command): - subprocess.call(command) - + + def RunBlastDBCommand(command): capture = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout = [x for x in capture.stdout] @@ -75,162 +65,7 @@ def RunBlastDBCommand(command): print("\nWarning:") print("".join(stdout[2:])) if len(stderr) > 0: print(stderr) - -def Worker_RunCommand(cmd_queue, nTotal): - while True: - try: - iCommand, command = cmd_queue.get(True, 1) - util.PrintTime("Running Blast %d of %d" % (iCommand, nTotal)) - subprocess.call(command) - util.PrintTime("Finished Blast %d of %d" % (iCommand, nTotal)) - except Queue.Empty: - return - -class util: - @staticmethod - def GetDirectoryName(baseDirName, dateString, i): - if i == 0: - return baseDirName + dateString + os.sep - else: - return baseDirName + dateString + ("_%d" % i) + os.sep - - """Call GetNameForNewWorkingDirectory before a call to CreateNewWorkingDirectory to find out what directory will be created""" - @staticmethod - def CreateNewWorkingDirectory(baseDirectoryName): - dateStr = 
datetime.date.today().strftime("%b%d") - iAppend = 0 - newDirectoryName = util.GetDirectoryName(baseDirectoryName, dateStr, iAppend) - while os.path.exists(newDirectoryName): - iAppend += 1 - newDirectoryName = util.GetDirectoryName(baseDirectoryName, dateStr, iAppend) - os.mkdir(newDirectoryName) - return newDirectoryName - - @staticmethod - def GetUnusedFilename(baseFilename, ext): - iAppend = 0 - newFilename = baseFilename + ext - while os.path.exists(newFilename): - iAppend += 1 - newFilename = baseFilename + ("_%d" % iAppend) + ext - return newFilename, iAppend - - @staticmethod - def PrintTime(message): - print(str(datetime.datetime.now()).rsplit(".", 1)[0] + " : " + message) - - @staticmethod - def SortArrayPairByFirst(useForSortAr, keepAlignedAr, qLargestFirst=False): - sortedTuples = sorted(zip(useForSortAr, keepAlignedAr), reverse=qLargestFirst) - useForSortAr = [i for i, j in sortedTuples] - keepAlignedAr = [j for i, j in sortedTuples] - return useForSortAr, keepAlignedAr - - @staticmethod - def PrintNoNewLine(text): - sys.stdout.write(text) - - @staticmethod - def SortFastaFilenames(fastaFilenames): - speciesIndices = [] - for f in fastaFilenames: - start = f.rfind("Species") - speciesIndices.append(int(f[start+7:-3])) - indices, sortedFasta = util.SortArrayPairByFirst(speciesIndices, fastaFilenames) - return sortedFasta - -def Fail(): - print("ERROR: An error occurred, please review previous error messages for more information.") - sys.exit() - -def ManageQueue(runningProcesses, cmd_queue): - """Manage a set of runningProcesses working through cmd_queue. - If there is an error the exit all processes as quickly as possible and - exit via Fail() methods. 
Otherwise return when all work is complete - """ - # set all completed processes to None - qError = False -# dones = [False for _ in runningProcesses] - nProcesses = len(runningProcesses) - while True: - if runningProcesses.count(None) == len(runningProcesses): break - time.sleep(2) -# for proc in runningProcesses: - for i in xrange(nProcesses): - proc = runningProcesses[i] - if proc == None: continue - if not proc.is_alive(): - if proc.exitcode != 0: - qError = True - while True: - try: - cmd_queue.get(True, 1) - except Queue.Empty: - break - runningProcesses[i] = None - if qError: - DeleteMatrices(fileInfo) - Fail() - -""" -IDExtractor -------------------------------------------------------------------------------- -""" -class IDExtractor(object): - """IDExtractor deals with the fact that for different datasets a user will - want to extract a unique sequence ID from the fasta file accessions uin different - ways.""" - def GetIDToNameDict(self): - raise NotImplementedError("Should not be implemented") - def GetNameToIDDict(self): - raise NotImplementedError("Should not be implemented") - -class FullAccession(IDExtractor): - def __init__(self, idsFilename): - # only want the first part and nothing else (easy!) 
- self.idToNameDict = dict() - self.nameToIDDict = dict() - with open(idsFilename, 'rb') as idsFile: - for line in idsFile: - if line.startswith("#"): continue - id, accession = line.rstrip().split(": ", 1) - # Replace problematic characters - accession = accession.replace(":", "_").replace(",", "_").replace("(", "_").replace(")", "_") - if id in self.idToNameDict: - raise RuntimeError("ERROR: A duplicate id was found in the fasta files: % s" % id) - self.idToNameDict[id] = accession - self.nameToIDDict[accession] = id - - def GetIDToNameDict(self): - return self.idToNameDict - - def GetNameToIDDict(self): - return self.nameToIDDict - -class FirstWordExtractor(IDExtractor): - def __init__(self, idsFilename): - # only want the first part and nothing else (easy!) - self.idToNameDict = dict() - self.nameToIDDict = dict() - with open(idsFilename, 'rb') as idsFile: - for line in idsFile: - id, rest = line.split(": ", 1) - accession = rest.split(None, 1)[0] - # Replace problematic characters - accession = accession.replace(":", "_").replace(",", "_").replace("(", "_").replace(")", "_") - if accession in self.nameToIDDict: - raise RuntimeError("A duplicate accession was found using just first part: % s" % accession) - if id in self.idToNameDict: - raise RuntimeError("ERROR: A duplicate id was found in the fasta files: % s" % id) - self.idToNameDict[id] = accession - self.nameToIDDict[accession] = id - - def GetIDToNameDict(self): - return self.idToNameDict - - def GetNameToIDDict(self): - return self.nameToIDDict - + def SpeciesNameDict(speciesIDsFN): speciesNamesDict = dict() with open(speciesIDsFN, 'rb') as speciesNamesFile: @@ -243,92 +78,9 @@ def SpeciesNameDict(speciesIDsFN): """ MCL ------------------------------------------------------------------------------- -""" - -class MCL: - @staticmethod - def GetPredictedOGs(clustersFilename): - predictedOGs = [] - nOGsString = "" - qContainsProfiles = False - with open(clustersFilename, 'rb') as clusterFile: - header = 
True - og = set() - for line in clusterFile: - if header: - if line.count("begin"): - header = False - else: - if line.find(")") != -1: - break - if line[-2] == "$": - line = line[:-3] - if line[0] == " ": - # continuation of group - x = line.split() - y = [x_ for x_ in x if not x_.startswith('Prof')] - og = og.union(y) - else: - # new OG - if len(og) != 0: - predictedOGs.append(og) - nOGsString, line = line.split(" ", 1) - x = line.split() - y = [x_ for x_ in x if not x_.startswith('Prof')] - if len(x) != len(y): - qContainsProfiles = True - og = set(y) - if len(og) > 0: - predictedOGs.append(og) - if not qContainsProfiles: - assert(len(predictedOGs) == int(nOGsString) + 1) - return predictedOGs - - @staticmethod - def GetSingleID(speciesStartingIndices, seq, speciesToUse): - iSpecies, iSeq = map(int, seq.split("_")) - offset = speciesStartingIndices[speciesToUse.index(iSpecies)] - return iSeq + offset - - @staticmethod - def GetIDPair(speciesStartingIndices, singleID, speciesToUse): - for i, startingIndex in enumerate(speciesStartingIndices): - if startingIndex > singleID: - return "%d_%d" % (speciesToUse[i-1], singleID - speciesStartingIndices[i-1]) - return "%d_%d" % (speciesToUse[-1], singleID - speciesStartingIndices[len(speciesStartingIndices)-1]) +""" - @staticmethod - def ConvertSingleIDsToIDPair(seqsInfo, clustersFilename, newFilename): - with open(clustersFilename, 'rb') as clusterFile, open(newFilename, "wb") as output: - header = True - for line in clusterFile: - appendDollar = False - initialText = "" - idsString = "" - ids = [] - if header: - output.write(line) - if line.count("begin"): - header = False - else: - if line.find(")") != -1: - output.write(line) - break - if line[-2] == "$": - line = line[:-3] - appendDollar = True - if line[0] != " ": - initialText, line = line.split(None, 1) - # continuation of group - ids = line.split() - for id in ids: - idsString += MCL.GetIDPair(seqsInfo.seqStartingIndices, int(id), seqsInfo.speciesToUse) + " " - 
output.write(initialText + " " + idsString) - if appendDollar: - output.write("$\n") - else: - output.write("\n") - +class MCL: @staticmethod def CreateOGs(predictedOGs, outputFilename, idDict): with open(outputFilename, 'wb') as outputFile: @@ -338,30 +90,28 @@ class MCL: outputFile.write(" ".join(accessions)) outputFile.write("\n") - @staticmethod + @staticmethod def prettify(elem): """Return a pretty-printed XML string for the Element. """ rough_string = ET.tostring(elem, 'utf-8') reparsed = minidom.parseString(rough_string) return reparsed.toprettyxml(indent=" ") - - @staticmethod + + @staticmethod def WriteOrthoXML(speciesInfo, predictedOGs, numbersOfSequences, idDict, orthoxmlFilename, speciesToUse): """ speciesInfo: ordered array for which each element has fastaFilename, speciesName, NCBITaxID, sourceDatabaseName, databaseVersionFastaFile """ - - # Write OrthoXML file root = ET.Element("orthoXML") root.set('xsi:schemaLocation', "http://orthoXML.org/2011/ http://www.orthoxml.org/0.3/orthoxml.xsd") - root.set('originVersion', version) + root.set('originVersion', util.version) root.set('origin', 'OrthoFinder') root.set('version', "0.3") root.set('xmlns:xsi', "http://www.w3.org/2001/XMLSchema-instance") #notes = SubElement(root, 'notes') - + # Species: details of source of genomes and sequences they contain speciesStartingIndices = [] iGene_all = 0 @@ -372,80 +122,80 @@ class MCL: speciesDatabaseNode = SubElement(speciesNode, "database") speciesDatabaseNode.set('name', thisSpeciesInfo[3]) # required speciesDatabaseNode.set('version', thisSpeciesInfo[4]) # required -# speciesDatabaseNode.set('geneLink', "") # skip -# speciesDatabaseNode.set('protLink', "") # skip -# speciesDatabaseNode.set('transcriptLink', "") # skip + # speciesDatabaseNode.set('geneLink', "") # skip + # speciesDatabaseNode.set('protLink', "") # skip + # speciesDatabaseNode.set('transcriptLink', "") # skip allGenesNode = SubElement(speciesDatabaseNode, "genes") 
speciesStartingIndices.append(iGene_all) for iGene_species in xrange(nSeqs): geneNode = SubElement(allGenesNode, 'gene') geneNode.set("geneId", idDict["%d_%d" % (iSpecies , iGene_species)]) geneNode.set('id', str(iGene_all)) # required -# geneNode.set("protID", "") # skip + # geneNode.set("protID", "") # skip iGene_all += 1 # Scores tag - unused -# scoresNode = SubElement(root, 'scores') # skip - + # scoresNode = SubElement(root, 'scores') # skip + # Orthogroups allGroupsNode = SubElement(root, 'groups') for iOg, og in enumerate(predictedOGs): groupNode = SubElement(allGroupsNode, 'orthologGroup') groupNode.set('id', str(iOg)) -# groupScoreNode = SubElement(groupNode, 'score') # skip -# groupScoreNode.set('id', "") # skip -# groupScoreNode.set('value', "") # skip -# SubElement(groupNode, 'property') # skip + # groupScoreNode = SubElement(groupNode, 'score') # skip + # groupScoreNode.set('id', "") # skip + # groupScoreNode.set('value', "") # skip + # SubElement(groupNode, 'property') # skip for seq in og: geneNode = SubElement(groupNode, 'geneRef') - geneNode.set('id', str(MCL.GetSingleID(speciesStartingIndices, seq, speciesToUse))) -# SubElement(geneNode, 'score') # skip + geneNode.set('id', str(MCLread.GetSingleID(speciesStartingIndices, seq, speciesToUse))) + # SubElement(geneNode, 'score') # skip with open(orthoxmlFilename, 'wb') as orthoxmlFile: -# ET.ElementTree(root).write(orthoxmlFile) + # ET.ElementTree(root).write(orthoxmlFile) orthoxmlFile.write(MCL.prettify(root)) print("Orthologous groups have been written to orthoxml file:\n %s" % orthoxmlFilename) - - @staticmethod + + @staticmethod def RunMCL(graphFilename, clustersFilename, nProcesses, inflation): nProcesses = 4 if nProcesses > 4 else nProcesses # MCL appears to take *longer* as more than 4 processes are used command = ["mcl", graphFilename, "-I", str(inflation), "-o", clustersFilename, "-te", str(nProcesses), "-V", "all"] - RunCommand(command) + util.RunCommand(command) util.PrintTime("Ran MCL") - 
- @staticmethod + + @staticmethod def WriteOrthogroupFiles(ogs, idsFilenames, resultsBaseFilename, clustersFilename_pairs): outputFN = resultsBaseFilename + ".txt" try: fullDict = dict() for idsFilename in idsFilenames: - idExtract = FirstWordExtractor(idsFilename) + idExtract = util.FirstWordExtractor(idsFilename) idDict = idExtract.GetIDToNameDict() fullDict.update(idDict) MCL.CreateOGs(ogs, outputFN, fullDict) except KeyError as e: sys.stderr.write("ERROR: Sequence ID not found in %s\n" % idsFilename) sys.stderr.write(str(e) + "\n") - Fail() + util.Fail() except RuntimeError as error: print(error.message) if error.message.startswith("ERROR"): print("ERROR: %s contains a duplicate ID. The IDs for the orthologous groups in %s will not be replaced with the sequence accessions. If %s was prepared manually then please check the IDs are correct. " % (idsFilename, clustersFilename_pairs, idsFilename)) - Fail() + util.Fail() else: print("Tried to use only the first part of the accession in order to list the sequences in each orthologous group\nmore concisely but these were not unique. The full accession line will be used instead.\n") try: fullDict = dict() for idsFilename in idsFilenames: - idExtract = FullAccession(idsFilename) + idExtract = util.FullAccession(idsFilename) idDict = idExtract.GetIDToNameDict() fullDict.update(idDict) MCL.CreateOGs(ogs, outputFN, fullDict) except: print("ERROR: %s contains a duplicate ID. The IDs for the orthologous groups in %s will not be replaced with the sequence accessions. If %s was prepared manually then please check the IDs are correct. 
" % (idsFilename, clustersFilename_pairs, idsFilename)) - Fail() + util.Fail() return fullDict - - @staticmethod + + @staticmethod def CreateOrthogroupTable(ogs, idToNameDict, speciesNamesDict, @@ -549,19 +299,10 @@ RunInfo ------------------------------------------------------------------------------- """ -SequencesInfo = namedtuple("SequencesInfo", "nSeqs nSpecies speciesToUse seqStartingIndices") -FileInfo = namedtuple("FileInfo", "inputDir outputDir graphFilename") - -def GetIDPairFromString(line): - return map(int, line.split("_")) - -def NumberOfSequences(seqsInfo, iSpecies): - return (seqsInfo.seqStartingIndices[iSpecies+1] if iSpecies != seqsInfo.nSpecies-1 else seqsInfo.nSeqs) - seqsInfo.seqStartingIndices[iSpecies] - def GetSequenceLengths(seqsInfo, fileInfo): sequenceLengths = [] for iSpecies, iFasta in enumerate(seqsInfo.speciesToUse): - sequenceLengths.append(np.zeros(NumberOfSequences(seqsInfo, iSpecies))) + sequenceLengths.append(np.zeros(BlastFileProcessor.NumberOfSequences(seqsInfo, iSpecies))) fastaFilename = fileInfo.inputDir + "Species%d.fa" % iFasta currentSequenceLength = 0 iCurrentSequence = -1 @@ -574,7 +315,7 @@ def GetSequenceLengths(seqsInfo, fileInfo): else: sequenceLengths[iSpecies][iCurrentSequence] = currentSequenceLength currentSequenceLength = 0 - _, iCurrentSequence = GetIDPairFromString(row[1:]) + _, iCurrentSequence = util.GetIDPairFromString(row[1:]) else: currentSequenceLength += len(row.rstrip()) sequenceLengths[iSpecies][iCurrentSequence] = currentSequenceLength @@ -588,30 +329,6 @@ def GetNumberOfSequencesInFile(filename): if line.startswith(">"): count+=1 return count -# Get Info from seqs IDs file? 
-def GetSeqsInfo(inputDirectory, speciesToUse): - seqStartingIndices = [0] - nSeqs = 0 - for i, iFasta in enumerate(speciesToUse): - fastaFilename = inputDirectory + "Species%d.fa" % iFasta - with open(fastaFilename) as infile: - for line in infile: - if len(line) > 1 and line[0] == ">": - nSeqs+=1 - seqStartingIndices.append(nSeqs) - seqStartingIndices = seqStartingIndices[:-1] - nSpecies = len(speciesToUse) - return SequencesInfo(nSeqs=nSeqs, nSpecies=nSpecies, speciesToUse=speciesToUse, seqStartingIndices=seqStartingIndices) - -def GetSpeciesToUse(speciesIDsFN): - speciesToUse = [] - with open(speciesIDsFN, 'rb') as speciesF: - for line in speciesF: - if len(line) == 0 or line[0] == "#": continue - speciesToUse.append(int(line.split(":")[0])) - return speciesToUse - - """ Question: Do I want to do all BLASTs or just the required ones? It's got to be all BLASTs I think. They could potentially be run after the clustering has finished.""" def GetOrderedBlastCommands(seqsInfo, previousFastaFiles, newFastaFiles, workingDir): @@ -629,138 +346,55 @@ def GetOrderedBlastCommands(seqsInfo, previousFastaFiles, newFastaFiles, working taskSizes, speciesPairs = util.SortArrayPairByFirst(taskSizes, speciesPairs, True) commands = [["blastp", "-outfmt", "6", "-evalue", "0.001", "-query", workingDir + "Species%d.fa" % iFasta, "-db", workingDir + "BlastDBSpecies%d" % iDB, "-out", "%sBlast%d_%d.txt" % (workingDir, iFasta, iDB)] for iFasta, iDB in speciesPairs] - return commands - -""" -BlastFileProcessor -------------------------------------------------------------------------------- -""" -class BlastFileProcessor(object): - @staticmethod - def GetBH_s(pairwiseScoresMatrices, seqsInfo, iSpecies, tol=1e-3): - nSeqs_i = NumberOfSequences(seqsInfo, iSpecies) - bestHitForSequence = -1*np.ones(nSeqs_i) - H = [None for i_ in xrange(seqsInfo.nSpecies)] # create array of Nones to be replace by matrices - for j in xrange(seqsInfo.nSpecies): - if iSpecies == j: - # identify orthologs then 
come back to paralogs - continue - W = pairwiseScoresMatrices[j] - I = [] - J = [] - for kRow in xrange(nSeqs_i): - values=W.getrowview(kRow) - if values.nnz == 0: - continue - m = max(values.data[0]) - bestHitForSequence[kRow] = m if m > bestHitForSequence[kRow] else bestHitForSequence[kRow] - # get all above this value with tolerance - temp = [index for index, value in zip(values.rows[0], values.data[0]) if value > m - tol] - J.extend(temp) - I.extend(kRow * np.ones(len(temp), dtype=np.dtype(int))) - H[j] = sparse.csr_matrix((np.ones(len(I)), (I, J)), shape=W.get_shape()) - # now look for paralogs - I = [] - J = [] - W = pairwiseScoresMatrices[iSpecies] - for kRow in xrange(nSeqs_i): - values=W.getrowview(kRow) - if values.nnz == 0: - continue - temp = [index for index, value in zip(values.rows[0], values.data[0]) if value > bestHitForSequence[kRow] - tol] - J.extend(temp) - I.extend(kRow * np.ones(len(temp), dtype=np.dtype(int))) - H[iSpecies] = sparse.csr_matrix((np.ones(len(I)), (I, J)), shape=W.get_shape()) - return H - - @staticmethod - def GetBLAST6Scores(seqsInfo, fileInfo, iSpecies, jSpecies, qExcludeSelfHits = True, sep = "_"): - nSeqs_i = NumberOfSequences(seqsInfo, iSpecies) - nSeqs_j = NumberOfSequences(seqsInfo, jSpecies) - B = sparse.lil_matrix((nSeqs_i, nSeqs_j)) - row = "" - try: - with open(fileInfo.inputDir + "Blast%d_%d.txt" % (seqsInfo.speciesToUse[iSpecies], seqsInfo.speciesToUse[jSpecies]), 'rb') as blastfile: - blastreader = csv.reader(blastfile, delimiter='\t') - for row in blastreader: - # Get hit and query IDs - try: - species1ID, sequence1ID = map(int, row[0].split(sep, 1)) - species2ID, sequence2ID = map(int, row[1].split(sep, 1)) - except (IndexError, ValueError): - sys.stderr.write("\nERROR: Query or hit sequence ID in BLAST results file was missing or incorrectly formatted.\n") - raise - # Get bit score for pair - try: - score = float(row[11]) - except (IndexError, ValueError): - sys.stderr.write("\nERROR: 12th field in BLAST 
results file line should be the bit-score for the hit\n") - raise - if (qExcludeSelfHits and species1ID == species2ID and sequence1ID == sequence2ID): - continue - # store bit score - try: - if score > B[sequence1ID, sequence2ID]: - B[sequence1ID, sequence2ID] = score - except IndexError: - def ord(n): - return str(n)+("th" if 4<=n%100<=20 else {1:"st",2:"nd",3:"rd"}.get(n%10, "th")) -# sys.stderr.write("\nError in input files, expected only %d sequences in species %d and %d sequences in species %d but found a hit in the Blast%d_%d.txt between sequence %d_%d (i.e. %s sequence in species) and sequence %d_%d (i.e. %s sequence in species)\n" % (nSeqs_i, iSpecies, nSeqs_j, jSpecies, iSpecies, jSpecies, iSpecies, sequence1ID, ord(sequence1ID+1), jSpecies, sequence2ID, ord(sequence2ID+1))) - sys.stderr.write("\nERROR: Inconsistent input files.\n") - kSpecies, nSeqs_k, sequencekID = (iSpecies, nSeqs_i, sequence1ID) if sequence1ID >= nSeqs_i else (jSpecies, nSeqs_j, sequence2ID) - sys.stderr.write("Species%d.fa contains only %d sequences " % (kSpecies, nSeqs_k)) - sys.stderr.write("but found a query/hit in the Blast%d_%d.txt for sequence %d_%d (i.e. 
%s sequence in species %d).\n" % (iSpecies, jSpecies, kSpecies, sequencekID, ord(sequencekID+1), kSpecies)) - sys.exit() - except Exception: - sys.stderr.write("Malformatted line in %sBlast%d_%d.txt\nOffending line was:\n" % (fileInfo.inputDir, seqsInfo.speciesToUse[iSpecies], seqsInfo.speciesToUse[jSpecies])) - sys.stderr.write("\t".join(row) + "\n") - sys.exit() - return B + return commands """ Matrices ------------------------------------------------------------------------------- """ -def DumpMatrix(name, m, fileInfo, iSpecies, jSpecies): - with open(fileInfo.outputDir + "%s%d_%d.pic" % (name, iSpecies, jSpecies), 'wb') as picFile: - pic.dump(m, picFile, protocol=picProtocol) - -def DumpMatrixArray(name, matrixArray, fileInfo, iSpecies): - for jSpecies, m in enumerate(matrixArray): - DumpMatrix(name, m, fileInfo, iSpecies, jSpecies) - def DeleteMatrices(fileInfo): for f in glob.glob(fileInfo.outputDir + "B*_*.pic"): if os.path.exists(f): os.remove(f) for f in glob.glob(fileInfo.outputDir + "connect*_*.pic"): if os.path.exists(f): os.remove(f) - -def LoadMatrix(name, fileInfo, iSpecies, jSpecies): - with open(fileInfo.outputDir + "%s%d_%d.pic" % (name, iSpecies, jSpecies), 'rb') as picFile: - M = pic.load(picFile) - return M - -def LoadMatrixArray(name, fileInfo, seqsInfo, iSpecies, row=True): - matrixArray = [] - for jSpecies in xrange(seqsInfo.nSpecies): - if row == True: - matrixArray.append(LoadMatrix(name, fileInfo, iSpecies, jSpecies)) - else: - matrixArray.append(LoadMatrix(name, fileInfo, jSpecies, iSpecies)) - return matrixArray - -def MatricesAnd_s(Xarr, Yarr): - Zarr = [] - for x, y in zip(Xarr, Yarr): - Zarr.append(x.multiply(y)) - return Zarr - -def MatricesAndTr_s(Xarr, Yarr): - Zarr = [] - for x, y in zip(Xarr, Yarr): - Zarr.append(x.multiply(y.transpose())) - return Zarr + +def GetBH_s(pairwiseScoresMatrices, seqsInfo, iSpecies, tol=1e-3): + nSeqs_i = BlastFileProcessor.NumberOfSequences(seqsInfo, iSpecies) + bestHitForSequence = 
-1*np.ones(nSeqs_i) + H = [None for i_ in xrange(seqsInfo.nSpecies)] # create array of Nones to be replace by matrices + for j in xrange(seqsInfo.nSpecies): + if iSpecies == j: + # identify orthologs then come back to paralogs + continue + W = pairwiseScoresMatrices[j] + I = [] + J = [] + for kRow in xrange(nSeqs_i): + values=W.getrowview(kRow) + if values.nnz == 0: + continue + m = max(values.data[0]) + bestHitForSequence[kRow] = m if m > bestHitForSequence[kRow] else bestHitForSequence[kRow] + # get all above this value with tolerance + temp = [index for index, value in zip(values.rows[0], values.data[0]) if value > m - tol] + J.extend(temp) + I.extend(kRow * np.ones(len(temp), dtype=np.dtype(int))) + H[j] = sparse.csr_matrix((np.ones(len(I)), (I, J)), shape=W.get_shape()) + # now look for paralogs + I = [] + J = [] + W = pairwiseScoresMatrices[iSpecies] + for kRow in xrange(nSeqs_i): + values=W.getrowview(kRow) + if values.nnz == 0: + continue + temp = [index for index, value in zip(values.rows[0], values.data[0]) if value > bestHitForSequence[kRow] - tol] + J.extend(temp) + I.extend(kRow * np.ones(len(temp), dtype=np.dtype(int))) + H[iSpecies] = sparse.csr_matrix((np.ones(len(I)), (I, J)), shape=W.get_shape()) + return H + """ WaterfallMethod @@ -773,14 +407,14 @@ def WriteGraph_perSpecies(args): with open(fileInfo.graphFilename + "_%d" % iSpec, 'wb') as graphFile: connect2 = [] for jSpec in xrange(seqsInfo.nSpecies): - m1 = LoadMatrix("connect", fileInfo, iSpec, jSpec) - m2tr = numeric.transpose(LoadMatrix("connect", fileInfo, jSpec, iSpec)) + m1 = matrices.LoadMatrix("connect", fileInfo, iSpec, jSpec) + m2tr = numeric.transpose(matrices.LoadMatrix("connect", fileInfo, jSpec, iSpec)) connect2.append(m1 + m2tr) - B = LoadMatrixArray("B", fileInfo, seqsInfo, iSpec) - B_connect = MatricesAnd_s(connect2, B) + B = matrices.LoadMatrixArray("B", fileInfo, seqsInfo, iSpec) + B_connect = matrices.MatricesAnd_s(connect2, B) W = [b.sorted_indices().tolil() for b in 
B_connect] - for query in xrange(NumberOfSequences(seqsInfo, iSpec)): + for query in xrange(BlastFileProcessor.NumberOfSequences(seqsInfo, iSpec)): offset = seqsInfo.seqStartingIndices[iSpec] graphFile.write("%d " % (offset + query)) for jSpec in xrange(seqsInfo.nSpecies): @@ -817,9 +451,9 @@ class WaterfallMethod: Bij = BlastFileProcessor.GetBLAST6Scores(seqsInfo, fileInfo, iSpecies, jSpecies) Bij = WaterfallMethod.NormaliseScores(Bij, Lengths, iSpecies, jSpecies) Bi.append(Bij) - DumpMatrixArray("B", Bi, fileInfo, iSpecies) - BH = BlastFileProcessor.GetBH_s(Bi, seqsInfo, iSpecies) - DumpMatrixArray("BH", BH, fileInfo, iSpecies) + matrices.DumpMatrixArray("B", Bi, fileInfo, iSpecies) + BH = GetBH_s(Bi, seqsInfo, iSpecies) + matrices.DumpMatrixArray("BH", BH, fileInfo, iSpecies) util.PrintTime("Initial processing of species %d complete" % iSpecies) @staticmethod @@ -834,12 +468,12 @@ class WaterfallMethod: @staticmethod def ConnectCognates(seqsInfo, fileInfo, iSpecies): # calculate RBH for species i - BHix = LoadMatrixArray("BH", fileInfo, seqsInfo, iSpecies) - BHxi = LoadMatrixArray("BH", fileInfo, seqsInfo, iSpecies, row=False) - RBHi = MatricesAndTr_s(BHix, BHxi) # twice as much work as before (only did upper triangular before) - B = LoadMatrixArray("B", fileInfo, seqsInfo, iSpecies) + BHix = matrices.LoadMatrixArray("BH", fileInfo, seqsInfo, iSpecies) + BHxi = matrices.LoadMatrixArray("BH", fileInfo, seqsInfo, iSpecies, row=False) + RBHi = matrices.MatricesAndTr_s(BHix, BHxi) # twice as much work as before (only did upper triangular before) + B = matrices.LoadMatrixArray("B", fileInfo, seqsInfo, iSpecies) connect = WaterfallMethod.ConnectAllBetterThanAnOrtholog_s(RBHi, B, seqsInfo, iSpecies) - DumpMatrixArray("connect", connect, fileInfo, iSpecies) + matrices.DumpMatrixArray("connect", connect, fileInfo, iSpecies) @staticmethod def Worker_ConnectCognates(cmd_queue): @@ -865,7 +499,7 @@ class WaterfallMethod: @staticmethod def GetMostDistant_s(RBH, B, seqsInfo, 
iSpec): - mostDistant = numeric.transpose(np.ones(NumberOfSequences(seqsInfo, iSpec))*1e9) + mostDistant = numeric.transpose(np.ones(BlastFileProcessor.NumberOfSequences(seqsInfo, iSpec))*1e9) for kSpec in xrange(seqsInfo.nSpecies): B[kSpec] = B[kSpec].tocsr() if iSpec == kSpec: @@ -878,7 +512,7 @@ class WaterfallMethod: @staticmethod def ConnectAllBetterThanCutoff_s(B, mostDistant, seqsInfo, iSpec): connect = [] - nSeqs_i = NumberOfSequences(seqsInfo, iSpec) + nSeqs_i = BlastFileProcessor.NumberOfSequences(seqsInfo, iSpec) for jSpec in xrange(seqsInfo.nSpecies): M=B[jSpec].tolil() if iSpec != jSpec: @@ -888,7 +522,7 @@ class WaterfallMethod: II = [i for (i, j) in IIJJ] JJ = [j for (i, j) in IIJJ] onesArray = np.ones(len(IIJJ)) - mat = sparse.csr_matrix( (onesArray, (II, JJ)), shape=(nSeqs_i, NumberOfSequences(seqsInfo, jSpec))) + mat = sparse.csr_matrix( (onesArray, (II, JJ)), shape=(nSeqs_i, BlastFileProcessor.NumberOfSequences(seqsInfo, jSpec))) connect.append(mat) return connect @@ -1078,20 +712,8 @@ OrthoFinder """ mclInflation = 1.5 -def CanRunCommand(command, qAllowStderr = False): - util.PrintNoNewLine("Test can run \"%s\"" % command) # print without newline - capture = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout = [x for x in capture.stdout] - stderr = [x for x in capture.stderr] - if len(stdout) > 0 and (qAllowStderr or len(stderr) == 0): - print(" - ok") - return True - else: - print(" - failed") - return False - def CanRunBLAST(): - if CanRunCommand("makeblastdb -help") and CanRunCommand("blastp -help"): + if util.CanRunCommand("makeblastdb -help") and util.CanRunCommand("blastp -help"): return True else: print("ERROR: Cannot run BLAST+") @@ -1100,17 +722,12 @@ def CanRunBLAST(): def CanRunMCL(): command = "mcl -h" - if CanRunCommand(command): + if util.CanRunCommand(command): return True else: print("ERROR: Cannot run MCL with the command \"%s\"" % command) print("Please check MCL is installed and in 
the system path\n") return False - -def PrintCitation(): - print("""\nWhen publishing work that uses OrthoFinder please cite: - D.M. Emms & S. Kelly (2015), OrthoFinder: solving fundamental biases in whole genome comparisons - dramatically improves orthogroup inference accuracy, Genome Biology 16:157.\n""") def PrintHelp(): print("Simple Usage") @@ -1149,7 +766,7 @@ def PrintHelp(): print("""-t number_of_blast_threads, --threads number_of_blast_threads The number of BLAST processes to be run simultaneously. This should be increased by the user to at least the number of cores on the computer so as to minimise the time taken to perform the BLAST all-versus-all - queries. [Default is %d]\n""" % nThreadsDefault) + queries. [Default is %d]\n""" % util.nThreadsDefault) print("""-a number_of_orthofinder_threads, --algthreads number_of_orthofinder_threads The number of threads to use for the OrthoFinder algorithm and MCL after BLAST searches have been completed. @@ -1157,7 +774,10 @@ def PrintHelp(): requirements proportionally so be aware of the amount of RAM you have available (and see README file). Additionally, as the algorithm implementation is very fast, file reading is likely to be the limiting factor above about 5-10 threads and additional threads may have little effect other than - increase RAM requirements. [Default is %d]\n""" % nAlgDefault) + increase RAM requirements. [Default is %d]\n""" % util.nAlgDefault) + + print("""-g, --groups + Only infer orthogroups, do not infer gene trees of orthologues.\n""") print("""-I inflation_parameter, --inflation inflation_parameter Specify a non-default inflation parameter for MCL. 
[Default is %0.1f]\n""" % mclInflation) @@ -1171,7 +791,7 @@ def PrintHelp(): print("""-h, --help Print this help text""") - PrintCitation() + util.PrintCitation() """ Main @@ -1181,7 +801,7 @@ Main def GetDirectoryArgument(arg, args): if len(args) == 0: print("Missing option for command line argument %s" % arg) - Fail() + util.Fail() directory = os.path.abspath(args.pop(0)) if directory[-1] != os.sep: directory += os.sep @@ -1228,10 +848,8 @@ def AssignIDsToSequences(fastaDirectory, outputDirectory): if len(originalFastaFilenames) > 0: outputFasta.close() return returnFilenames, originalFastaFilenames, idsFilename, speciesFilename, newSpeciesIDs, previousSpeciesIDs -if __name__ == "__main__": - import get_orthologues - - print("\nOrthoFinder version %s Copyright (C) 2014 David Emms\n" % version) +if __name__ == "__main__": + print("\nOrthoFinder version %s Copyright (C) 2014 David Emms\n" % util.version) print(""" This program comes with ABSOLUTELY NO WARRANTY. This is free software, and you are welcome to redistribute it under certain conditions. 
For details please see the License.md that came with this software.\n""") @@ -1240,8 +858,8 @@ if __name__ == "__main__": sys.exit() # Control - nBlast = nThreadsDefault - nProcessAlg = nAlgDefault + nBlast = util.nThreadsDefault + nProcessAlg = util.nAlgDefault qUsePrecalculatedBlast = False # remove, just store BLAST to do qUseFastaFiles = False # local to argument checking qXML = False @@ -1264,70 +882,72 @@ if __name__ == "__main__": if arg == "-f" or arg == "--fasta": if qUseFastaFiles: print("Repeated argument: -f/--fasta") - Fail() + util.Fail() qUseFastaFiles = True fastaDir = GetDirectoryArgument(arg, args) elif arg == "-b" or arg == "--blast": if qUsePrecalculatedBlast: print("Repeated argument: -b/--blast") - Fail() + util.Fail() qUsePrecalculatedBlast = True workingDir_previous = GetDirectoryArgument(arg, args) elif arg == "-t" or arg == "--threads": if len(args) == 0: print("Missing option for command line argument -t") - Fail() + util.Fail() arg = args.pop(0) try: nBlast = int(arg) except: print("Incorrect argument for number of BLAST threads: %s" % arg) - Fail() + util.Fail() elif arg == "-a" or arg == "--algthreads": if len(args) == 0: print("Missing option for command line argument -a") - Fail() + util.Fail() arg = args.pop(0) try: nProcessAlg = int(arg) except: print("Incorrect argument for number of BLAST threads: %s" % arg) - Fail() + util.Fail() elif arg == "-I" or arg == "--inflation": if len(args) == 0: print("Missing option for command line argument -I") - Fail() + util.Fail() arg = args.pop(0) try: mclInflation = float(arg) except: print("Incorrect argument for MCL inflation parameter: %s" % arg) - Fail() + util.Fail() elif arg == "-x" or arg == "--orthoxml": if qXML: print("Repeated argument: -x/--orthoxml") - Fail() + util.Fail() qXML = True if len(args) == 0: print("Missing option for command line argument %s" % arg) - Fail() + util.Fail() speciesInfoFilename = args.pop(0) # elif arg == "-s" or arg == "--subset": # if qUseSubset: # 
print("Repeated argument: -s/--subset") -# Fail() +# util.Fail() # qUseSubset = True # qUsePrecalculatedBlast = True # workingDir_previous = GetDirectoryArgument(arg, args) elif arg == "-p" or arg == "--prepare": qOnlyPrepare = True qOrthologues = False + elif arg == "-g" or arg == "--groups": + qOrthologues = False elif arg == "-h" or arg == "--help": PrintHelp() sys.exit() else: print("Unrecognised argument: %s\n" % arg) - Fail() + util.Fail() # check argument combinations if qUseFastaFiles and qUsePrecalculatedBlast: @@ -1338,8 +958,8 @@ if __name__ == "__main__": speciesIdsFilename = workingDir_previous + "SpeciesIDs.txt" if not os.path.exists(speciesIdsFilename): print("%s file must be provided if using previously calculated BLAST results" % speciesIdsFilename) - Fail() - speciesToUse = GetSpeciesToUse(speciesIdsFilename) + util.Fail() + speciesToUse = util.GetSpeciesToUse(speciesIdsFilename) workingDir = os.path.abspath(workingDir) + os.sep if resultsDir == None: workingDir = workingDir_previous @@ -1350,13 +970,13 @@ if __name__ == "__main__": # check BLAST results directory exists if not os.path.exists(workingDir_previous): print("Previous/Pre-calculated BLAST results directory does not exist: %s\n" % workingDir_previous) - Fail() + util.Fail() # check fasta files are present previousFastaFiles = util.SortFastaFilenames(glob.glob(workingDir_previous + "Species*.fa")) if len(previousFastaFiles) == 0: print("No processed fasta files in the supplied previous working directory: %s\n" % workingDir_previous) - Fail() + util.Fail() tokens = previousFastaFiles[-1][:-3].split("Species") lastFastaNumberString = tokens[-1] iLastFasta = 0 @@ -1365,10 +985,10 @@ if __name__ == "__main__": iLastFasta = int(lastFastaNumberString) except: print("Filenames for processed fasta files are incorrect: %s\n" % previousFastaFiles[-1]) - Fail() + util.Fail() if nFasta != iLastFasta + 1: print("Not all expected fasta files are present. 
Index of last fasta file is %s but found %d fasta files.\n" % (lastFastaNumberString, len(previousFastaFiles))) - Fail() + util.Fail() # check BLAST files qHaveBlast = True @@ -1378,13 +998,13 @@ if __name__ == "__main__": if not os.path.exists(filename): print("BLAST results file is missing: %s" % filename) qHaveBlast = False - if not qHaveBlast: Fail() + if not qHaveBlast: util.Fail() # check SequenceIDs.txt and SpeciesIDs.txt files are present idsFilename = workingDir_previous + "SequenceIDs.txt" if not os.path.exists(idsFilename): print("%s file must be provided if using previous calculated BLAST results" % idsFilename) - Fail() + util.Fail() print("Using previously calculated BLAST results in %s" % workingDir_previous) else: # - create working directory @@ -1400,9 +1020,9 @@ if __name__ == "__main__": print("\n1. Checking required programs are installed") print("-------------------------------------------") if (not qUsePrecalculatedBlast) and (not CanRunBLAST()): - Fail() + util.Fail() if not CanRunMCL(): - Fail() + util.Fail() # - rename sequences with unique, simple identifiers print("\n2. Temporarily renaming sequences with unique, simple identifiers") @@ -1413,7 +1033,7 @@ if __name__ == "__main__": newFastaFiles, userFastaFilenames, idsFilename, speciesIdsFilename, newSpeciesIDs, previousSpeciesIDs = AssignIDsToSequences(fastaDir, workingDir) speciesToUse = speciesToUse + newSpeciesIDs print("Done!") - seqsInfo = GetSeqsInfo(workingDir_previous if qUsePrecalculatedBlast else workingDir, speciesToUse) + seqsInfo = util.GetSeqsInfo(workingDir_previous if qUsePrecalculatedBlast else workingDir, speciesToUse) if qXML: print("\n2b. 
Reading species information file") @@ -1437,7 +1057,7 @@ if __name__ == "__main__": print("Each line should contain 5 tab-delimited fields:") print(" fastaFilename, speciesName, NCBITaxID, sourceDatabaseName, databaseFastaFilename") print("See README file for more information.") - Fail() + util.Fail() fastaFilename, speciesName, NCBITaxID, sourceDatabaseName, databaseVersionFastaFile = line try: iSpecies = fastaFileIndices[fastaFilename] @@ -1448,7 +1068,7 @@ if __name__ == "__main__": for filename in userFastaFilenames_justNames: print(filename) print("Please provide information for each of these species in the species information file") - Fail() + util.Fail() speciesInfo[iSpecies] = line # check information has been provided for all species speciesMissing = False @@ -1461,7 +1081,7 @@ if __name__ == "__main__": speciesMissing = True print(fastaFilename) if speciesMissing: - Fail() + util.Fail() print("\n3. Dividing up work for BLAST for parallel processing") print( "-----------------------------------------------------") @@ -1496,9 +1116,9 @@ if __name__ == "__main__": cmd_queue = mp.Queue() for iCmd, cmd in enumerate(commands): cmd_queue.put((iCmd+1, cmd)) - runningProcesses = [mp.Process(target=Worker_RunCommand, args=(cmd_queue, len(commands))) for i_ in xrange(nBlast)] + runningProcesses = [mp.Process(target=util.Worker_RunCommand, args=(cmd_queue, nBlast, len(commands))) for i_ in xrange(nBlast)] for proc in runningProcesses: - proc.start() + proc.start()# for proc in runningProcesses: while proc.is_alive(): proc.join() @@ -1511,13 +1131,13 @@ if __name__ == "__main__": # Run Algorithm, cluster and output cluster files with original accessions print("\n5. 
Running OrthoFinder algorithm") print( "--------------------------------") - fileIdentifierString = "OrthoFinder_v%s" % version + fileIdentifierString = "OrthoFinder_v%s" % util.version graphFilename = workingDir + "%s_graph.txt" % fileIdentifierString # it's important to free up the memory from python used for processing the genomes # before launching MCL becuase both use sizeable ammounts of memory. The only # way I can find to do this is to launch the memory intensive python code # as separate process that exitsbefore MCL is launched. - fileInfo = FileInfo(inputDir=workingDir_previous if qUsePrecalculatedBlast else workingDir, outputDir = workingDir, graphFilename=graphFilename) + fileInfo = util.FileInfo(inputDir=workingDir_previous if qUsePrecalculatedBlast else workingDir, outputDir = workingDir, graphFilename=graphFilename) if not os.path.exists(fileInfo.outputDir): os.mkdir(fileInfo.outputDir) Lengths = GetSequenceLengths(seqsInfo, fileInfo) @@ -1530,7 +1150,7 @@ if __name__ == "__main__": runningProcesses = [mp.Process(target=WaterfallMethod.Worker_ProcessBlastHits, args=(cmd_queue, )) for i_ in xrange(nProcessAlg)] for proc in runningProcesses: proc.start() - ManageQueue(runningProcesses, cmd_queue) + util.ManageQueue(runningProcesses, cmd_queue) cmd_queue = mp.Queue() for iSpecies in xrange(seqsInfo.nSpecies): @@ -1538,7 +1158,7 @@ if __name__ == "__main__": runningProcesses = [mp.Process(target=WaterfallMethod.Worker_ConnectCognates, args=(cmd_queue, )) for i_ in xrange(nProcessAlg)] for proc in runningProcesses: proc.start() - ManageQueue(runningProcesses, cmd_queue) + util.ManageQueue(runningProcesses, cmd_queue) util.PrintTime("Connected putatitive homologs") WaterfallMethod.WriteGraphParallel(seqsInfo, fileInfo) @@ -1547,12 +1167,12 @@ if __name__ == "__main__": clustersFilename, iResultsVersion = util.GetUnusedFilename(workingDir + "clusters_%s_I%0.1f" % (fileIdentifierString, mclInflation), ".txt") MCL.RunMCL(graphFilename, clustersFilename, 
nProcessAlg, mclInflation) clustersFilename_pairs = clustersFilename + "_id_pairs.txt" - MCL.ConvertSingleIDsToIDPair(seqsInfo, clustersFilename, clustersFilename_pairs) + MCLread.ConvertSingleIDsToIDPair(seqsInfo, clustersFilename, clustersFilename_pairs) print("\n6. Creating files for Orthologous Groups") print( "----------------------------------------") - if not qOrthologues: PrintCitation() - ogs = MCL.GetPredictedOGs(clustersFilename_pairs) + if not qOrthologues: util.PrintCitation() + ogs = MCLread.GetPredictedOGs(clustersFilename_pairs) resultsBaseFilename = util.GetUnusedFilename(resultsDir + "OrthologousGroups", ".csv")[:-4] # remove .csv from base filename resultsBaseFilename = resultsDir + "OrthologousGroups" + ("" if iResultsVersion == 0 else "_%d" % iResultsVersion) idsDict = MCL.WriteOrthogroupFiles(ogs, [idsFilename], resultsBaseFilename, clustersFilename_pairs) @@ -1575,4 +1195,4 @@ if __name__ == "__main__": print(statsFile) print("") print(summaryText) - PrintCitation() + util.PrintCitation() diff --git a/orthofinder/scripts/__init__.py b/orthofinder/scripts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/orthofinder/scripts/__init__.py @@ -0,0 +1 @@ + diff --git a/orthofinder/scripts/blast_file_processor.py b/orthofinder/scripts/blast_file_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..ad556f4258828444f8682e37104434f6a8f50d03 --- /dev/null +++ b/orthofinder/scripts/blast_file_processor.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2014 David Emms +# +# This program (OrthoFinder) is distributed under the terms of the GNU General Public License v3 +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# When publishing work that uses OrthoFinder please cite: +# Emms, D.M. and Kelly, S. (2015) OrthoFinder: solving fundamental biases in whole genome comparisons dramatically +# improves orthogroup inference accuracy, Genome Biology 16:157 +# +# For any enquiries send an email to David Emms +# david_emms@hotmail.com + +import sys +import csv +from scipy import sparse + +def NumberOfSequences(seqsInfo, iSpecies): + return (seqsInfo.seqStartingIndices[iSpecies+1] if iSpecies != seqsInfo.nSpecies-1 else seqsInfo.nSeqs) - seqsInfo.seqStartingIndices[iSpecies] + +def GetBLAST6Scores(seqsInfo, fileInfo, iSpecies, jSpecies, qExcludeSelfHits = True, sep = "_"): + nSeqs_i = NumberOfSequences(seqsInfo, iSpecies) + nSeqs_j = NumberOfSequences(seqsInfo, jSpecies) + B = sparse.lil_matrix((nSeqs_i, nSeqs_j)) + row = "" + try: + with open(fileInfo.inputDir + "Blast%d_%d.txt" % (seqsInfo.speciesToUse[iSpecies], seqsInfo.speciesToUse[jSpecies]), 'rb') as blastfile: + blastreader = csv.reader(blastfile, delimiter='\t') + for row in blastreader: + # Get hit and query IDs + try: + species1ID, sequence1ID = map(int, row[0].split(sep, 1)) + species2ID, sequence2ID = map(int, row[1].split(sep, 1)) + except (IndexError, ValueError): + sys.stderr.write("\nERROR: Query or hit sequence ID in BLAST results file was missing or incorrectly formatted.\n") + raise + # Get bit score for pair + try: + score = float(row[11]) + except (IndexError, ValueError): + sys.stderr.write("\nERROR: 12th field in BLAST results file line should be the bit-score for the hit\n") + raise + if (qExcludeSelfHits and species1ID == 
species2ID and sequence1ID == sequence2ID): + continue + # store bit score + try: + if score > B[sequence1ID, sequence2ID]: + B[sequence1ID, sequence2ID] = score + except IndexError: + def ord(n): + return str(n)+("th" if 4<=n%100<=20 else {1:"st",2:"nd",3:"rd"}.get(n%10, "th")) +# sys.stderr.write("\nError in input files, expected only %d sequences in species %d and %d sequences in species %d but found a hit in the Blast%d_%d.txt between sequence %d_%d (i.e. %s sequence in species) and sequence %d_%d (i.e. %s sequence in species)\n" % (nSeqs_i, iSpecies, nSeqs_j, jSpecies, iSpecies, jSpecies, iSpecies, sequence1ID, ord(sequence1ID+1), jSpecies, sequence2ID, ord(sequence2ID+1))) + sys.stderr.write("\nERROR: Inconsistent input files.\n") + kSpecies, nSeqs_k, sequencekID = (iSpecies, nSeqs_i, sequence1ID) if sequence1ID >= nSeqs_i else (jSpecies, nSeqs_j, sequence2ID) + sys.stderr.write("Species%d.fa contains only %d sequences " % (kSpecies, nSeqs_k)) + sys.stderr.write("but found a query/hit in the Blast%d_%d.txt for sequence %d_%d (i.e. 
%s sequence in species %d).\n" % (iSpecies, jSpecies, kSpecies, sequencekID, ord(sequencekID+1), kSpecies)) + sys.exit() + except Exception: + sys.stderr.write("Malformatted line in %sBlast%d_%d.txt\nOffending line was:\n" % (fileInfo.inputDir, seqsInfo.speciesToUse[iSpecies], seqsInfo.speciesToUse[jSpecies])) + sys.stderr.write("\t".join(row) + "\n") + sys.exit() + return B \ No newline at end of file diff --git a/get_orthologues.py b/orthofinder/scripts/get_orthologues.py similarity index 87% rename from get_orthologues.py rename to orthofinder/scripts/get_orthologues.py index ac7a1e7c04c060bf628cacaf6e19e30f23b62208..82e32d76f1ad5955bb54caa96aa10c0118641ebf 100755 --- a/get_orthologues.py +++ b/orthofinder/scripts/get_orthologues.py @@ -1,13 +1,33 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -""" -Created on Thu Jun 9 16:00:33 2016 - -@author: david -""" +# +# Copyright 2014 David Emms +# +# This program (OrthoFinder) is distributed under the terms of the GNU General Public License v3 +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# When publishing work that uses OrthoFinder please cite: +# Emms, D.M. and Kelly, S. 
(2015) OrthoFinder: solving fundamental biases in whole genome comparisons dramatically +# improves orthogroup inference accuracy, Genome Biology 16:157 +# +# For any enquiries send an email to David Emms +# david_emms@hotmail.comhor: david -import sys import os +import sys +import glob import subprocess import numpy as np from collections import Counter, defaultdict @@ -16,15 +36,15 @@ import itertools import multiprocessing as mp import Queue +import util import tree -import trees_for_orthogroups as tfo -import orthofinder +import matrices +import mcl as MCL import root_from_duplications as rfd -import process_trees as pt - -#print(orthofinder.__file__) +import orthologues_from_recon_trees as pt +import blast_file_processor as BlastFileProcessor -nThreads = orthofinder.nThreadsDefault +nThreads = util.nThreadsDefault class Seq(object): def __init__(self, seqInput): @@ -61,21 +81,19 @@ class Seq(object): # ============================================================================================================================== class OrthoGroupsSet(object): - def __init__(self, orthofinderWorkingDir, clustersFilename_pairs, idExtractor = orthofinder.FirstWordExtractor): + def __init__(self, orthofinderWorkingDir, clustersFilename_pairs, idExtractor = util.FirstWordExtractor): self.workingDirOF = orthofinderWorkingDir self.seqIDsFN = orthofinderWorkingDir + "SequenceIDs.txt" self.speciesIDsFN = orthofinderWorkingDir + "SpeciesIDs.txt" - self.speciesIDsEx = orthofinder.FullAccession(self.speciesIDsFN) + self.speciesIDsEx = util.FullAccession(self.speciesIDsFN) self._Spec_SeqIDs = None self._extractor = idExtractor self.clustersFN = clustersFilename_pairs self.seqIDsEx = None self.ogs = self.OGs() - self.speciesToUse = orthofinder.GetSpeciesToUse(self.speciesIDsFN) - self.seqsInfo = orthofinder.GetSeqsInfo(orthofinderWorkingDir, self.speciesToUse) - self.fileInfo = orthofinder.FileInfo(inputDir=orthofinderWorkingDir, outputDir = orthofinderWorkingDir, 
graphFilename="") -# nSeqs, nSpecies, speciesStartingIndices = orthofinder.BlastFileProcessor.GetNumberOfSequencesInFileFromDir(workingDir, self.speciesToUse) -# self.bfp = orthofinder.BlastFileProcessor(workingDir, self.speciesToUse, nSeqs, nSpecies, speciesStartingIndices) + self.speciesToUse = util.GetSpeciesToUse(self.speciesIDsFN) + self.seqsInfo = util.GetSeqsInfo(orthofinderWorkingDir, self.speciesToUse) + self.fileInfo = util.FileInfo(inputDir=orthofinderWorkingDir, outputDir = orthofinderWorkingDir, graphFilename="") def SequenceDict(self): if self.seqIDsEx == None: @@ -103,15 +121,9 @@ class OrthoGroupsSet(object): return self._Spec_SeqIDs def OGs(self): - ogs = orthofinder.MCL.GetPredictedOGs(self.clustersFN) + ogs = MCL.GetPredictedOGs(self.clustersFN) ogs = [[Seq(g) for g in og] for og in ogs if len(og) >= 4] return ogs - -# def BitScores(self, iSp, jSp): -## return self.bfp.GetBLAST6Scores(self.speciesToUse.index(iSp), self.speciesToUse.index(jSp), False) -# return orthofinder.BlastFileProcessor.GetBLAST6Scores(self.seqsInfo, self.fileInfo, iSp, jSp, False) -## with open(self.workingDir + "B%d_%d.pic" % (iSp, jSp), 'rb') as infile: -## return pic.load(infile) # ============================================================================================================================== @@ -193,10 +205,10 @@ def Worker_BlastScores(cmd_queue, seqsInfo, fileInfo, nProcesses, nToDo): i, args = cmd_queue.get(True, 1) nDone = i - nProcesses + 1 if nDone >= 0 and divmod(nDone, 10 if nToDo <= 200 else 100 if nToDo <= 2000 else 1000)[1] == 0: - orthofinder.util.PrintTime("Done %d of %d" % (nDone, nToDo)) - B = orthofinder.BlastFileProcessor.GetBLAST6Scores(seqsInfo, fileInfo, *args, qExcludeSelfHits = False) + util.PrintTime("Done %d of %d" % (nDone, nToDo)) + B = BlastFileProcessor.GetBLAST6Scores(seqsInfo, fileInfo, *args, qExcludeSelfHits = False) with open(fileInfo.outputDir + "Bit%d_%d.pic" % args, 'wb') as outfile: - pic.dump(B, outfile, protocol = 
orthofinder.picProtocol) + pic.dump(B, outfile, protocol = util.picProtocol) except Queue.Empty: return @@ -250,8 +262,8 @@ class DendroBLASTTrees(object): nSeqs = self.NumberOfSequences(self.species) ogMatrices = [np.zeros((n, n)) for n in nGenes] for iiSp, sp1 in enumerate(self.species): - orthofinder.util.PrintTime("Processing species %d" % sp1) - Bs = [orthofinder.LoadMatrix("Bit", self.ogSet.fileInfo, iiSp, jjSp) for jjSp in xrange(len(self.species))] + util.PrintTime("Processing species %d" % sp1) + Bs = [matrices.LoadMatrix("Bit", self.ogSet.fileInfo, iiSp, jjSp) for jjSp in xrange(len(self.species))] mins = np.ones((nSeqs[sp1], 1), dtype=np.float64)*9e99 maxes = np.zeros((nSeqs[sp1], 1), dtype=np.float64) for B, sp2 in zip(Bs, self.species): @@ -264,6 +276,10 @@ class DendroBLASTTrees(object): m[i, j] = 0.5*max(B[gi.iSeq, gj.iSeq], mins[gi.iSeq]) / maxes[gi.iSeq] return ogs, ogMatrices + def DeleteBlastMatrices(self, workingDir): + for f in glob.glob(workingDir + "B*_*.pic"): + if os.path.exists(f): os.remove(f) + def WriteOGMatrices(self, ogs, ogMatrices): newMatrices = [] for iog, (og, m) in enumerate(zip(ogs, ogMatrices)): @@ -347,12 +363,13 @@ class DendroBLASTTrees(object): def RunAnalysis(self): ogs, ogMatrices_partial = self.GetOGMatrices() ogMatrices = self.WriteOGMatrices(ogs, ogMatrices_partial) + D, spPairs = self.SpeciesTreeDistances(ogs, ogMatrices) cmd_spTree, spTreeFN_ids = self.PrepareSpeciesTreeCommand(D, spPairs) cmds_geneTrees = self.PrepareGeneTreeCommand() print("\n3. 
Inferring gene and species trees") print( "-----------------------------------") - tfo.RunParallelCommandSets(self.nProcesses, [[cmd_spTree]] + cmds_geneTrees, qHideStdout = True) + util.RunParallelOrderedCommandLists(self.nProcesses, [[cmd_spTree]] + cmds_geneTrees, qHideStdout = True) seqDict = self.ogSet.Spec_SeqDict() for iog in xrange(len(self.ogSet.ogs)): self.RenameTreeTaxa(self.treesPatIDs % iog, self.treesPat % iog, seqDict, qFixNegatives=True) @@ -440,31 +457,33 @@ def RunDlcpar(treesPat, ogSet, nOGs, speciesTreeFN, workingDir): dlcCommands = ['dlcpar_search -s %s -S %s -D 1 -C 0.125 %s -O %s' % (speciesTreeFN, geneMapFN, fn, dlcparResultsDir + os.path.splitext(os.path.split(fn)[1])[0]) for fn in filenames] # print(dlcCommands[0]) # use this to run in parallel - tfo.RunParallelCommandSets(nThreads, [[c] for c in dlcCommands], qHideStdout = True) + util.RunParallelOrderedCommandLists(nThreads, [[c] for c in dlcCommands], qHideStdout = True) return dlcparResultsDir # ============================================================================================================================== # Main -def WriteTestDistancesFile(workingDir): - testFN = workingDir + "SimpleTest.phy" +def WriteTestDistancesFile(testFN): with open(testFN, 'wb') as outfile: outfile.write("4\n1_1 0 0 0.2 0.25\n0_2 0 0 0.21 0.28\n3_1 0.21 0.21 0 0\n4_1 0.25 0.28 0 0") return testFN def CanRunDependencies(workingDir): # FastME - testFN = WriteTestDistancesFile(workingDir) + testFN = workingDir + "SimpleTest.phy" + WriteTestDistancesFile(testFN) outFN = workingDir + "SimpleTest.tre" if os.path.exists(outFN): os.remove(outFN) - if not orthofinder.CanRunCommand("fastme -i %s -o %s" % (testFN, outFN), qAllowStderr=False): + if not util.CanRunCommand("fastme -i %s -o %s" % (testFN, outFN), qAllowStderr=False): print("ERROR: Cannot run fastme") print("Please check FastME is installed and that the executables are in the system path\n") return False os.remove(testFN) os.remove(outFN) + 
fastme_stat_fn = workingDir + "SimpleTest.phy_fastme_stat.txt" + if os.path.exists(fastme_stat_fn): os.remove(fastme_stat_fn) # DLCPar - if not orthofinder.CanRunCommand("dlcpar_search --version", qAllowStderr=False): + if not util.CanRunCommand("dlcpar_search --version", qAllowStderr=False): print("ERROR: Cannot run dlcpar_search") print("Please check DLCpar is installed and that the executables are in the system path\n") return False @@ -484,11 +503,11 @@ def PrintHelp(): print("""-t max_number_of_threads, --threads max_number_of_threads The maximum number of processes to be run simultaneously. The deafult is %d but this - should be increased by the user to the maximum number of cores available.\n""" % orthofinder.nThreadsDefault) + should be increased by the user to the maximum number of cores available.\n""" % util.nThreadsDefault) print("""-h, --help Print this help text""") - orthofinder.PrintCitation() + util.PrintCitation() def GetResultsFilesString(rootedSpeciesTreeFN): st = "" @@ -509,18 +528,18 @@ def GetResultsFilesString(rootedSpeciesTreeFN): def GetOrthologues(orthofinderWorkingDir, orthofinderResultsDir, clustersFilename_pairs, nProcesses): - ogSet = OrthoGroupsSet(orthofinderWorkingDir, clustersFilename_pairs, idExtractor = orthofinder.FirstWordExtractor) + ogSet = OrthoGroupsSet(orthofinderWorkingDir, clustersFilename_pairs, idExtractor = util.FirstWordExtractor) if len(ogSet.speciesToUse) < 4: print("ERROR: Not enough species to infer species tree") - orthofinder.Fail() + util.Fail() print("\n1. Checking required programs are installed") print( "-------------------------------------------") - if not CanRunDependencies(orthofinderWorkingDir): orthofinder.Fail() + if not CanRunDependencies(orthofinderWorkingDir): util.Fail() print("\n2. 
Reading sequence similarity scores") print( "-------------------------------------") - resultsDir = orthofinder.util.CreateNewWorkingDirectory(orthofinderResultsDir + "Orthologues_") + resultsDir = util.CreateNewWorkingDirectory(orthofinderResultsDir + "Orthologues_") db = DendroBLASTTrees(ogSet, resultsDir, nProcesses) db.ReadAndPickle() @@ -580,13 +599,13 @@ if __name__ == "__main__": if arg == "-t" or arg == "--threads": if len(args) == 0: print("Missing option for command line argument -t") - orthofinder.Fail() + util.Fail() arg = args.pop(0) try: nProcesses = int(arg) except: print("Incorrect argument for number of threads: %s" % arg) - orthofinder.Fail() + util.Fail() else: userDir = arg @@ -596,13 +615,13 @@ if __name__ == "__main__": if nProcesses == None: print("""\nNumber of parallel processes has not been specified, will use the default value. Number of parallel processes can be specified using the -t option.""") - nProcesses = orthofinder.nThreadsDefault + nProcesses = util.nThreadsDefault print("Using %d threads for alignments and trees" % nProcesses) - orthofinderWorkingDir, orthofinderResultsDir, clustersFilename_pairs = tfo.GetOGsFile(userDir) + orthofinderWorkingDir, orthofinderResultsDir, clustersFilename_pairs = util.GetOGsFile(userDir) resultsString = GetOrthologues(orthofinderWorkingDir, orthofinderResultsDir, clustersFilename_pairs, nProcesses) print(resultsString) - orthofinder.PrintCitation() + util.PrintCitation() diff --git a/orthofinder/scripts/matrices.py b/orthofinder/scripts/matrices.py new file mode 100644 index 0000000000000000000000000000000000000000..0df906a60c0c03ba6af4ce8cda6e9a916182a905 --- /dev/null +++ b/orthofinder/scripts/matrices.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2014 David Emms +# +# This program (OrthoFinder) is distributed under the terms of the GNU General Public License v3 +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General 
Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# When publishing work that uses OrthoFinder please cite: +# Emms, D.M. and Kelly, S. (2015) OrthoFinder: solving fundamental biases in whole genome comparisons dramatically +# improves orthogroup inference accuracy, Genome Biology 16:157 +# +# For any enquiries send an email to David Emms +# david_emms@hotmail.com + +import cPickle as pic + +import util + +def DumpMatrix(name, m, fileInfo, iSpecies, jSpecies): + with open(fileInfo.outputDir + "%s%d_%d.pic" % (name, iSpecies, jSpecies), 'wb') as picFile: + pic.dump(m, picFile, protocol=util.picProtocol) + +def DumpMatrixArray(name, matrixArray, fileInfo, iSpecies): + for jSpecies, m in enumerate(matrixArray): + DumpMatrix(name, m, fileInfo, iSpecies, jSpecies) + +def LoadMatrix(name, fileInfo, iSpecies, jSpecies): + with open(fileInfo.outputDir + "%s%d_%d.pic" % (name, iSpecies, jSpecies), 'rb') as picFile: + M = pic.load(picFile) + return M + +def LoadMatrixArray(name, fileInfo, seqsInfo, iSpecies, row=True): + matrixArray = [] + for jSpecies in xrange(seqsInfo.nSpecies): + if row == True: + matrixArray.append(LoadMatrix(name, fileInfo, iSpecies, jSpecies)) + else: + matrixArray.append(LoadMatrix(name, fileInfo, jSpecies, iSpecies)) + return matrixArray + +def MatricesAnd_s(Xarr, Yarr): + Zarr = [] + for x, y in zip(Xarr, Yarr): + Zarr.append(x.multiply(y)) + return Zarr + +def MatricesAndTr_s(Xarr, Yarr): + Zarr = [] + for x, y in zip(Xarr, Yarr): + Zarr.append(x.multiply(y.transpose())) 
+ return Zarr diff --git a/orthofinder/scripts/mcl.py b/orthofinder/scripts/mcl.py new file mode 100644 index 0000000000000000000000000000000000000000..19b09e6e54ab55b463570ea1c80531166570d4ca --- /dev/null +++ b/orthofinder/scripts/mcl.py @@ -0,0 +1,105 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2014 David Emms +# +# This program (OrthoFinder) is distributed under the terms of the GNU General Public License v3 +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# When publishing work that uses OrthoFinder please cite: +# Emms, D.M. and Kelly, S. 
(2015) OrthoFinder: solving fundamental biases in whole genome comparisons dramatically +# improves orthogroup inference accuracy, Genome Biology 16:157 +# +# For any enquiries send an email to David Emms +# david_emms@hotmail.com + +def GetPredictedOGs(clustersFilename): + predictedOGs = [] + nOGsString = "" + qContainsProfiles = False + with open(clustersFilename, 'rb') as clusterFile: + header = True + og = set() + for line in clusterFile: + if header: + if line.count("begin"): + header = False + else: + if line.find(")") != -1: + break + if line[-2] == "$": + line = line[:-3] + if line[0] == " ": + # continuation of group + x = line.split() + y = [x_ for x_ in x if not x_.startswith('Prof')] + og = og.union(y) + else: + # new OG + if len(og) != 0: + predictedOGs.append(og) + nOGsString, line = line.split(" ", 1) + x = line.split() + y = [x_ for x_ in x if not x_.startswith('Prof')] + if len(x) != len(y): + qContainsProfiles = True + og = set(y) + if len(og) > 0: + predictedOGs.append(og) + if not qContainsProfiles: + assert(len(predictedOGs) == int(nOGsString) + 1) + return predictedOGs + +def GetSingleID(speciesStartingIndices, seq, speciesToUse): + iSpecies, iSeq = map(int, seq.split("_")) + offset = speciesStartingIndices[speciesToUse.index(iSpecies)] + return iSeq + offset + +def GetIDPair(speciesStartingIndices, singleID, speciesToUse): + for i, startingIndex in enumerate(speciesStartingIndices): + if startingIndex > singleID: + return "%d_%d" % (speciesToUse[i-1], singleID - speciesStartingIndices[i-1]) + return "%d_%d" % (speciesToUse[-1], singleID - speciesStartingIndices[len(speciesStartingIndices)-1]) + +def ConvertSingleIDsToIDPair(seqsInfo, clustersFilename, newFilename): + with open(clustersFilename, 'rb') as clusterFile, open(newFilename, "wb") as output: + header = True + for line in clusterFile: + appendDollar = False + initialText = "" + idsString = "" + ids = [] + if header: + output.write(line) + if line.count("begin"): + header = False + 
else: + if line.find(")") != -1: + output.write(line) + break + if line[-2] == "$": + line = line[:-3] + appendDollar = True + if line[0] != " ": + initialText, line = line.split(None, 1) + # continuation of group + ids = line.split() + for id in ids: + idsString += GetIDPair(seqsInfo.seqStartingIndices, int(id), seqsInfo.speciesToUse) + " " + output.write(initialText + " " + idsString) + if appendDollar: + output.write("$\n") + else: + output.write("\n") + diff --git a/newick.py b/orthofinder/scripts/newick.py similarity index 100% rename from newick.py rename to orthofinder/scripts/newick.py diff --git a/process_trees.py b/orthofinder/scripts/orthologues_from_recon_trees.py similarity index 83% rename from process_trees.py rename to orthofinder/scripts/orthologues_from_recon_trees.py index 705f6c67ed64b5efa254934590a7a186cf8ffccb..840eff48364d6715d3542e79646c557e0261f43c 100644 --- a/process_trees.py +++ b/orthofinder/scripts/orthologues_from_recon_trees.py @@ -1,6 +1,31 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2014 David Emms +# +# This program (OrthoFinder) is distributed under the terms of the GNU General Public License v3 +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# When publishing work that uses OrthoFinder please cite: +# Emms, D.M. and Kelly, S. 
(2015) OrthoFinder: solving fundamental biases in whole genome comparisons dramatically +# improves orthogroup inference accuracy, Genome Biology 16:157 +# +# For any enquiries send an email to David Emms +# david_emms@hotmail.com + import re import os -import sys import csv import glob import itertools @@ -10,7 +35,7 @@ import tree from scipy import sparse from collections import defaultdict -import orthofinder +import util def natural_sort_key(s, _nsre=re.compile('([0-9]+)')): return [int(text) if text.isdigit() else text.lower() for text in re.split(_nsre, s)] @@ -43,8 +68,8 @@ def make_dicts(dlcparResultsDir, outputDir): return Orthologs, picFN def GetSpeciesGenesInfo(ogSet): - speciesLabels = orthofinder.GetSpeciesToUse(ogSet.speciesIDsFN) - seqsInfo = orthofinder.GetSeqsInfo(ogSet.workingDirOF, speciesLabels) + speciesLabels = util.GetSpeciesToUse(ogSet.speciesIDsFN) + seqsInfo = util.GetSeqsInfo(ogSet.workingDirOF, speciesLabels) genenumbers = list(np.diff(seqsInfo.seqStartingIndices)) genenumbers.append(seqsInfo.nSeqs - seqsInfo.seqStartingIndices[-1]) return speciesLabels, genenumbers @@ -54,7 +79,7 @@ def one_to_one_efficient(orthodict, genenumbers, speciesLabels, iSpecies, output try to mostly deal with iSpecies which is the ordinal number not the label it is given """ #Creates all matrices and appends them to matrixlist. 
- orthofinder.util.PrintTime("Processing orthologues for species %d" % iSpecies) + util.PrintTime("Processing orthologues for species %d" % iSpecies) matrixlist = [] numspecies = len(speciesLabels) speciesLabelsReverse = {label:i for i, label in enumerate(speciesLabels)} @@ -141,17 +166,4 @@ def get_orthologue_lists(ogSet, resultsDir, dlcparResultsDir, workingDir): # -> csv files species_find_all(ogSet.SpeciesDict(), ogSet.SequenceDict(), matrixDir, resultsDir) - -if __name__ == "__main__": - import get_orthologues as go - d = "/home/david/projects/Orthology/OrthoFinder/Development/Orthologues/ExampleDataset_Results/" - orthofinderWorkingDir = d + "WorkingDirectory/" - clustersFilename_pairs = orthofinderWorkingDir + "clusters_OrthoFinder_v0.7.0_I1.5.txt_id_pairs.txt" - resultsDir = d + "Orthologues_Aug18_test2/" - dlcparResultsDir = resultsDir + "/WorkingDirectory/dlcpar/" - workingDir = resultsDir + "/WorkingDirectory/" - - ogSet = go.OrthoGroupsSet(orthofinderWorkingDir, clustersFilename_pairs, idExtractor = orthofinder.FirstWordExtractor) - - get_orthologue_lists(ogSet, resultsDir, dlcparResultsDir, workingDir) diff --git a/root_from_duplications.py b/orthofinder/scripts/root_from_duplications.py similarity index 93% rename from root_from_duplications.py rename to orthofinder/scripts/root_from_duplications.py index 05e91ff17edad1506ae659cd774e795788c23117..370c8f2df7cff4ca261bfae082cab3f961d0fa62 100644 --- a/root_from_duplications.py +++ b/orthofinder/scripts/root_from_duplications.py @@ -1,16 +1,30 @@ +#!/usr/bin/env python # -*- coding: utf-8 -*- -""" -Created on Tue Jul 26 14:34:46 2016 - -@author: david -""" +# +# Copyright 2014 David Emms +# +# This program (OrthoFinder) is distributed under the terms of the GNU General Public License v3 +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# 
(at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# When publishing work that uses OrthoFinder please cite: +# Emms, D.M. and Kelly, S. (2015) OrthoFinder: solving fundamental biases in whole genome comparisons dramatically +# improves orthogroup inference accuracy, Genome Biology 16:157 +# +# For any enquiries send an email to David Emms +# david_emms@hotmail.com -""" -Performance improvements: -- Use an unrooted tree representation so that the trees don't need to be rerooted at all -- convert all the gene names to species names once per tree - -""" import os import glob #import cProfile as profile @@ -21,6 +35,8 @@ import itertools import multiprocessing as mp from collections import Counter +import util + def cmp_equal(exp, act): """exp - expected set of species act - actual set of species @@ -43,8 +59,7 @@ criteria = "crit_all" spTreeFormat = 3 qSerial = False -nProcs = 64 -#nProcs = 16 +nProcs = util.nThreadsDefault class Node(object): """ The class allows the user to get the 'child' nodes in any of the three directions @@ -404,7 +419,6 @@ if __name__ == "__main__": parser.add_argument("-s", "--separator", choices=("dot", "dash", "second_dash", "3rd_dash")) parser.add_argument("-S", "--Species_tree") parser.add_argument("-d", "--directory", action="store_true") - parser.add_argument("-v", "--verbose", action="store_true") args = parser.parse_args() GeneToSpecies = GeneToSpecies_dash @@ -441,10 +455,8 @@ if __name__ == "__main__": print("Best outgroup(s) for species tree:") for r in roots: print(r) - if args.verbose: PlotTree(speciesTree, args.input_tree, clusters) else: root, clusters, 
_, nSupport = GetRoot(args.Species_tree, args.input_tree, GeneToSpecies, nProcs, treeFmt = 1) for r in root: print(r) speciesTree = tree.Tree(args.Species_tree, format=1) - if args.verbose: PlotTree(speciesTree, args.input_tree, clusters, qSimplePrint=False) diff --git a/tree.py b/orthofinder/scripts/tree.py similarity index 99% rename from tree.py rename to orthofinder/scripts/tree.py index 489b221374b6337575fd14e7b67edc4301702db8..1364bc83aca1f26ec9c41177652c12c76ed18623 100644 --- a/tree.py +++ b/orthofinder/scripts/tree.py @@ -37,7 +37,6 @@ # # #END_LICENSE############################################################# -import os import cPickle import random import copy @@ -159,20 +158,6 @@ class TreeNode(object): #: A list of children nodes children = property(fget=_get_children, fset=_set_children) - def _set_face_areas(self, value): - if isinstance(value, _FaceAreas): - self._faces = value - else: - raise ValueError("[%s] is not a valid FaceAreas instance" %type(value)) - - def _get_face_areas(self): - if not hasattr(self, "_faces"): - self._faces = _FaceAreas() - return self._faces - - faces = property(fget=_get_face_areas, \ - fset=_set_face_areas) - def __init__(self, newick=None, format=0, dist=None, support=None, name=None): self._children = [] diff --git a/orthofinder/scripts/util.py b/orthofinder/scripts/util.py new file mode 100644 index 0000000000000000000000000000000000000000..cd8a1f3ca8107eb0fb8c19786f73889498ab596a --- /dev/null +++ b/orthofinder/scripts/util.py @@ -0,0 +1,388 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2014 David Emms +# +# This program (OrthoFinder) is distributed under the terms of the GNU General Public License v3 +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# When publishing work that uses OrthoFinder please cite: +# Emms, D.M. and Kelly, S. (2015) OrthoFinder: solving fundamental biases in whole genome comparisons dramatically +# improves orthogroup inference accuracy, Genome Biology 16:157 +# +# For any enquiries send an email to David Emms +# david_emms@hotmail.com + +nThreadsDefault = 16 +nAlgDefault = 1 + +import os +import sys +import glob +import time +import subprocess +import datetime +import Queue +import multiprocessing as mp +from collections import namedtuple + + +""" +Utilities +------------------------------------------------------------------------------- +""" +SequencesInfo = namedtuple("SequencesInfo", "nSeqs nSpecies speciesToUse seqStartingIndices") +FileInfo = namedtuple("FileInfo", "inputDir outputDir graphFilename") + +picProtocol = 1 +version = "1.0.1" + +def PrintNoNewLine(text): + sys.stdout.write(text) + +def PrintTime(message): + print(str(datetime.datetime.now()).rsplit(".", 1)[0] + " : " + message) + +""" +Command & parallel command management +------------------------------------------------------------------------------- +""" + +def RunCommand(command): + subprocess.call(command) + +def RunOrderedCommandList(commandSet, qHideStdout): + if qHideStdout: + for cmd in commandSet: + subprocess.call(cmd, shell=True, stdout=subprocess.PIPE) + else: + for cmd in commandSet: + subprocess.call(cmd, shell=True) + +def CanRunCommand(command, qAllowStderr = False): + PrintNoNewLine("Test can run \"%s\"" % command) # print without newline + capture = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, 
stderr=subprocess.PIPE) + stdout = [x for x in capture.stdout] + stderr = [x for x in capture.stderr] + if len(stdout) > 0 and (qAllowStderr or len(stderr) == 0): + print(" - ok") + return True + else: + print(" - failed") + return False + +def Worker_RunCommand(cmd_queue, nProcesses, nToDo): + while True: + try: + i, command = cmd_queue.get(True, 1) + nDone = i - nProcesses + 1 + if nDone >= 0 and divmod(nDone, 10 if nToDo <= 200 else 100 if nToDo <= 2000 else 1000)[1] == 0: + PrintTime("Done %d of %d" % (nDone, nToDo)) + subprocess.call(command) + except Queue.Empty: + return + +def Worker_RunOrderedCommandList(cmd_queue, nProcesses, nToDo, qHideStdout): + """ repeatedly takes items to process from the queue until it is empty at which point it returns. Does not take a new task + if it can't acquire queueLock as this indicates the queue is being rearranged. + + Writes each commands output and stderr to a file + """ + while True: + try: + i, commandSet = cmd_queue.get(True, 1) + nDone = i - nProcesses + 1 + if nDone >= 0 and divmod(nDone, 10 if nToDo <= 200 else 100 if nToDo <= 2000 else 1000)[1] == 0: + PrintTime("Done %d of %d" % (nDone, nToDo)) + RunOrderedCommandList(commandSet, qHideStdout) + except Queue.Empty: + return + +def RunParallelCommands(nProcesses, commands, qHideStdout = False): + """nProcesss - the number of processes to run in parallel + commands - list of commands to be run in parallel + """ + # Setup the workers and run + cmd_queue = mp.Queue() + for i, cmd in enumerate(commands): + cmd_queue.put((i, cmd)) + runningProcesses = [mp.Process(target=Worker_RunCommand, args=(cmd_queue, nProcesses, i+1)) for i_ in xrange(nProcesses)] + for proc in runningProcesses: + proc.start() + + for proc in runningProcesses: + while proc.is_alive(): + proc.join(10.) 
+ time.sleep(2) + +def RunParallelOrderedCommandLists(nProcesses, commands, qHideStdout = False): + """nProcesss - the number of processes to run in parallel + commands - list of lists of commands where the commands in the inner list are completed in order (the i_th won't run until + the i-1_th has finished). + """ + # Setup the workers and run + cmd_queue = mp.Queue() + for i, cmd in enumerate(commands): + cmd_queue.put((i, cmd)) + runningProcesses = [mp.Process(target=Worker_RunOrderedCommandList, args=(cmd_queue, nProcesses, i+1, qHideStdout)) for i_ in xrange(nProcesses)] + for proc in runningProcesses: + proc.start() + + for proc in runningProcesses: + while proc.is_alive(): + proc.join(10.) + time.sleep(2) + + +def ManageQueue(runningProcesses, cmd_queue): + """Manage a set of runningProcesses working through cmd_queue. + If there is an error the exit all processes as quickly as possible and + exit via Fail() methods. Otherwise return when all work is complete + """ + # set all completed processes to None + qError = False +# dones = [False for _ in runningProcesses] + nProcesses = len(runningProcesses) + while True: + if runningProcesses.count(None) == len(runningProcesses): break + time.sleep(2) +# for proc in runningProcesses: + for i in xrange(nProcesses): + proc = runningProcesses[i] + if proc == None: continue + if not proc.is_alive(): + if proc.exitcode != 0: + qError = True + while True: + try: + cmd_queue.get(True, 1) + except Queue.Empty: + break + runningProcesses[i] = None + if qError: + Fail() +""" +Directory and file management +------------------------------------------------------------------------------- +""" + +def GetDirectoryName(baseDirName, dateString, i): + if i == 0: + return baseDirName + dateString + os.sep + else: + return baseDirName + dateString + ("_%d" % i) + os.sep + +"""Call GetNameForNewWorkingDirectory before a call to CreateNewWorkingDirectory to find out what directory will be created""" +def 
CreateNewWorkingDirectory(baseDirectoryName): + dateStr = datetime.date.today().strftime("%b%d") + iAppend = 0 + newDirectoryName = GetDirectoryName(baseDirectoryName, dateStr, iAppend) + while os.path.exists(newDirectoryName): + iAppend += 1 + newDirectoryName = GetDirectoryName(baseDirectoryName, dateStr, iAppend) + os.mkdir(newDirectoryName) + return newDirectoryName + +def GetUnusedFilename(baseFilename, ext): + iAppend = 0 + newFilename = baseFilename + ext + while os.path.exists(newFilename): + iAppend += 1 + newFilename = baseFilename + ("_%d" % iAppend) + ext + return newFilename, iAppend + +def SortArrayPairByFirst(useForSortAr, keepAlignedAr, qLargestFirst=False): + sortedTuples = sorted(zip(useForSortAr, keepAlignedAr), reverse=qLargestFirst) + useForSortAr = [i for i, j in sortedTuples] + keepAlignedAr = [j for i, j in sortedTuples] + return useForSortAr, keepAlignedAr + +def SortFastaFilenames(fastaFilenames): + speciesIndices = [] + for f in fastaFilenames: + start = f.rfind("Species") + speciesIndices.append(int(f[start+7:-3])) + indices, sortedFasta = SortArrayPairByFirst(speciesIndices, fastaFilenames) + return sortedFasta + + +# Get Info from seqs IDs file? 
+def GetSeqsInfo(inputDirectory, speciesToUse): + seqStartingIndices = [0] + nSeqs = 0 + for i, iFasta in enumerate(speciesToUse): + fastaFilename = inputDirectory + "Species%d.fa" % iFasta + with open(fastaFilename) as infile: + for line in infile: + if len(line) > 1 and line[0] == ">": + nSeqs+=1 + seqStartingIndices.append(nSeqs) + seqStartingIndices = seqStartingIndices[:-1] + nSpecies = len(speciesToUse) + return SequencesInfo(nSeqs=nSeqs, nSpecies=nSpecies, speciesToUse=speciesToUse, seqStartingIndices=seqStartingIndices) + +def GetSpeciesToUse(speciesIDsFN): + speciesToUse = [] + with open(speciesIDsFN, 'rb') as speciesF: + for line in speciesF: + if len(line) == 0 or line[0] == "#": continue + speciesToUse.append(int(line.split(":")[0])) + return speciesToUse + +def Fail(): + print("ERROR: An error occurred, please review previous error messages for more information.") + sys.exit() + +""" +IDExtractor +------------------------------------------------------------------------------- +""" + +def GetIDPairFromString(line): + return map(int, line.split("_")) + +class IDExtractor(object): + """IDExtractor deals with the fact that for different datasets a user will + want to extract a unique sequence ID from the fasta file accessions uin different + ways.""" + def GetIDToNameDict(self): + raise NotImplementedError("Should not be implemented") + def GetNameToIDDict(self): + raise NotImplementedError("Should not be implemented") + +class FullAccession(IDExtractor): + def __init__(self, idsFilename): + # only want the first part and nothing else (easy!) 
+ self.idToNameDict = dict() + self.nameToIDDict = dict() + with open(idsFilename, 'rb') as idsFile: + for line in idsFile: + if line.startswith("#"): continue + id, accession = line.rstrip().split(": ", 1) + # Replace problematic characters + accession = accession.replace(":", "_").replace(",", "_").replace("(", "_").replace(")", "_") + if id in self.idToNameDict: + raise RuntimeError("ERROR: A duplicate id was found in the fasta files: % s" % id) + self.idToNameDict[id] = accession + self.nameToIDDict[accession] = id + + def GetIDToNameDict(self): + return self.idToNameDict + + def GetNameToIDDict(self): + return self.nameToIDDict + +class FirstWordExtractor(IDExtractor): + def __init__(self, idsFilename): + # only want the first part and nothing else (easy!) + self.idToNameDict = dict() + self.nameToIDDict = dict() + with open(idsFilename, 'rb') as idsFile: + for line in idsFile: + id, rest = line.split(": ", 1) + accession = rest.split(None, 1)[0] + # Replace problematic characters + accession = accession.replace(":", "_").replace(",", "_").replace("(", "_").replace(")", "_") + if accession in self.nameToIDDict: + raise RuntimeError("A duplicate accession was found using just first part: % s" % accession) + if id in self.idToNameDict: + raise RuntimeError("ERROR: A duplicate id was found in the fasta files: % s" % id) + self.idToNameDict[id] = accession + self.nameToIDDict[accession] = id + + def GetIDToNameDict(self): + return self.idToNameDict + + def GetNameToIDDict(self): + return self.nameToIDDict + +def IsWorkingDirectory(orthofinderWorkingDir): + ok = True + ok = ok and len(glob.glob(orthofinderWorkingDir + "clusters_OrthoFinder_*.txt_id_pairs.txt")) > 0 + ok = ok and len(glob.glob(orthofinderWorkingDir + "Species*.fa")) > 0 + return ok + +""" +Find results of previous run +------------------------------------------------------------------------------- +""" + +def GetOGsFile(userArg): + """returns the WorkingDirectory, ResultsDirectory and 
clusters_id_pairs filename""" + qSpecifiedResultsFile = False + if userArg == None: + print("ERROR: orthofinder_results_directory has not been specified") + Fail() + if os.path.isfile(userArg): + fn = os.path.split(userArg)[1] + if ("clusters_OrthoFinder_" not in fn) or ("txt_id_pairs.txt" not in fn): + print("ERROR:\n %s\nis neither a directory or a clusters_OrthoFinder_*.txt_id_pairs.txt file." % userArg) + Fail() + qSpecifiedResultsFile = True + # user has specified specific results file + elif userArg[-1] != os.path.sep: + userArg += os.path.sep + + # find required files + if qSpecifiedResultsFile: + orthofinderWorkingDir = os.path.split(userArg)[0] + os.sep + if not IsWorkingDirectory(orthofinderWorkingDir): + print("ERROR: cannot find files from OrthoFinder run in directory:\n %s" % orthofinderWorkingDir) + Fail() + else: + orthofinderWorkingDir = os.path.split(userArg)[0] if qSpecifiedResultsFile else userArg + if not IsWorkingDirectory(orthofinderWorkingDir): + orthofinderWorkingDir = userArg + "WorkingDirectory" + os.sep + if not IsWorkingDirectory(orthofinderWorkingDir): + print("ERROR: cannot find files from OrthoFinder run in directory:\n %s\nor\n %s\n" % (userArg, orthofinderWorkingDir)) + Fail() + + if qSpecifiedResultsFile: + print("Generating trees for orthogroups in file:\n %s" % userArg) + return orthofinderWorkingDir, orthofinderWorkingDir, userArg + else: + # identify orthogroups file + clustersFiles = glob.glob(orthofinderWorkingDir + "clusters_OrthoFinder_*.txt_id_pairs.txt") + orthogroupFiles = glob.glob(orthofinderWorkingDir + "OrthologousGroups*.txt") + if orthofinderWorkingDir != userArg: + orthogroupFiles += glob.glob(userArg + "OrthologousGroups*.txt") + # User may have specified a WorkingDirectory and results could be in directory above + if len(orthogroupFiles) < len(clustersFiles): + orthogroupFiles += glob.glob(userArg + ".." 
+ os.sep + "OrthologousGroups*.txt") + clustersFiles = sorted(clustersFiles) + orthogroupFiles = sorted(orthogroupFiles) + if len(clustersFiles) > 1 or len(orthogroupFiles) > 1: + print("ERROR: Results from multiple OrthoFinder runs found\n") + print("Tab-delimiter OrthologousGroups*.txt files:") + for fn in orthogroupFiles: + print(" " + fn) + print("With corresponding cluster files:") + for fn in clustersFiles: + print(" " + fn) + print("\nPlease run with only one set of results in directories or specifiy the specific clusters_OrthoFinder_*.txt_id_pairs.txt file on the command line") + Fail() + + if len(clustersFiles) != 1 or len(orthogroupFiles) != 1: + print("ERROR: Results not found in <orthofinder_results_directory> or <orthofinder_results_directory>/WorkingDirectory") + print("\nCould not find:\n OrthologousGroups*.txt\nor\n clusters_OrthoFinder_*.txt_id_pairs.txt") + Fail() + + print("Generating trees for orthogroups in file:\n %s" % orthogroupFiles[0]) + print("and corresponding clusters file:\n %s" % clustersFiles[0]) + return orthofinderWorkingDir, userArg, clustersFiles[0] + +def PrintCitation(): + print("""\nWhen publishing work that uses OrthoFinder please cite: + D.M. Emms & S. 
Kelly (2015), OrthoFinder: solving fundamental biases in whole genome comparisons + dramatically improves orthogroup inference accuracy, Genome Biology 16:157.\n""") diff --git a/trees_for_orthogroups.py b/orthofinder/trees_from_MSA.py similarity index 62% rename from trees_for_orthogroups.py rename to orthofinder/trees_from_MSA.py index 7dbd61de3da03f1e85c80ab1cd3b34a2202a6cd4..ba8df3877660e12cfe3ec1ca82c2dcd63c2fe738 100755 --- a/trees_for_orthogroups.py +++ b/orthofinder/trees_from_MSA.py @@ -32,26 +32,11 @@ Created on Thu Sep 25 13:15:22 2014 """ import os import sys -import multiprocessing as mp -import time -import subprocess -import Queue import glob -import orthofinder +import scripts.mcl as MCL +import scripts.util as util -version = "1.0.0" - -def RunCommandSet(commandSet, qHideStdout): -# orthofinder.util.PrintTime("Runing command: %s" % commandSet[-1]) - if qHideStdout: - for cmd in commandSet: - subprocess.call(cmd, shell=True, stdout=subprocess.PIPE) - else: - for cmd in commandSet: - subprocess.call(cmd, shell=True) -# orthofinder.util.PrintTime("Finshed command: %s" % commandSet[-1]) - class FastaWriter(object): def __init__(self, fastaFileDir): self.SeqLists = dict() @@ -91,52 +76,12 @@ class FastaWriter(object): def SortSeqs(self, seqs): return sorted(seqs, key=lambda x: map(int, x.split("_"))) - -def Worker_RunCommand(cmd_queue, nProcesses, nToDo, qHideStdout): - """ repeatedly takes items to process from the queue until it is empty at which point it returns. Does not take a new task - if it can't acquire queueLock as this indicates the queue is being rearranged. 
- - Writes each commands output and stderr to a file - """ - while True: - try: - i, commandSet = cmd_queue.get(True, 1) - nDone = i - nProcesses + 1 - if nDone >= 0 and divmod(nDone, 10 if nToDo <= 200 else 100 if nToDo <= 2000 else 1000)[1] == 0: - orthofinder.util.PrintTime("Done %d of %d" % (nDone, nToDo)) - RunCommandSet(commandSet, qHideStdout) - except Queue.Empty: - return - -def RunParallelCommandSets(nProcesses, commands, qHideStdout = False): - """nProcesss - the number of processes to run in parallel - commands - list of lists of commands where the commands in the inner list are completed in order (the i_th won't run until - the i-1_th has finished). - """ - # Setup the workers and run - cmd_queue = mp.Queue() - for i, cmd in enumerate(commands): - cmd_queue.put((i, cmd)) - runningProcesses = [mp.Process(target=Worker_RunCommand, args=(cmd_queue, nProcesses, i+1, qHideStdout)) for i_ in xrange(nProcesses)] - for proc in runningProcesses: - proc.start() - - for proc in runningProcesses: - while proc.is_alive(): - proc.join(10.) 
- time.sleep(2) def WriteTestFile(workingDir): testFN = workingDir + "SimpleTest.fa" with open(testFN, 'wb') as outfile: outfile.write(">a\nA\n>b\nA") return testFN - -def IsWorkingDirectory(orthofinderWorkingDir): - ok = True - ok = ok and len(glob.glob(orthofinderWorkingDir + "clusters_OrthoFinder_*.txt_id_pairs.txt")) > 0 - ok = ok and len(glob.glob(orthofinderWorkingDir + "Species*.fa")) > 0 - return ok class TreesForOrthogroups(object): def __init__(self, baseOutputDir, orthofinderWorkingDir): @@ -197,15 +142,15 @@ class TreesForOrthogroups(object): def DoTrees(self, ogs, idDict, nProcesses, nSwitchToMafft=500): testFN = WriteTestFile(self.orthofinderWorkingDir) - if not orthofinder.CanRunCommand("mafft %s" % testFN, qAllowStderr=True): + if not util.CanRunCommand("mafft %s" % testFN, qAllowStderr=True): print("ERROR: Cannot run mafft") print("Please check MAFFT is installed and that the executables are in the system path\n") return False - if not orthofinder.CanRunCommand("mafft %s" % testFN, qAllowStderr=True): + if not util.CanRunCommand("mafft %s" % testFN, qAllowStderr=True): print("ERROR: Cannot run mafft") print("Please check mafft is installed and that the executables are in the system path\n") return False - if not orthofinder.CanRunCommand("FastTree %s" % testFN, qAllowStderr=True): + if not util.CanRunCommand("FastTree %s" % testFN, qAllowStderr=True): print("ERROR: Cannot run FastTree") print("Please check FastTree is installed and that the executables are in the system path\n") return False @@ -241,9 +186,9 @@ class TreesForOrthogroups(object): print(cmd) print("") - RunParallelCommandSets(nProcesses, commandsSet) + util.RunParallelOrderedCommandLists(nProcesses, commandsSet) - orthofinder.PrintCitation() + util.PrintCitation() print("\nFasta files for orthogroups have been written to:\n %s\n" % (self.baseOutputDir + "Sequences/")) print("Multiple sequences alignments have been written to:\n %s\n" % (self.baseOutputDir + "Alignments/")) 
print("Gene trees have been written to:\n %s\n" % (self.baseOutputDir + "Trees/")) @@ -262,31 +207,31 @@ def PrintHelp(): print("""-t max_number_of_threads, --threads max_number_of_threads The maximum number of processes to be run simultaneously. The deafult is %d but this - should be increased by the user to the maximum number of cores available.\n""" % orthofinder.nThreadsDefault) + should be increased by the user to the maximum number of cores available.\n""" % util.nThreadsDefault) print("""-h, --help Print this help text""") - orthofinder.PrintCitation() + util.PrintCitation() def GetIDsDict(orthofinderWorkingDir): # sequence IDs idsFilename = orthofinderWorkingDir + "SequenceIDs.txt" try: - idExtract = orthofinder.FirstWordExtractor(idsFilename) + idExtract = util.FirstWordExtractor(idsFilename) idDict = idExtract.GetIDToNameDict() except RuntimeError as error: print(error.message) if error.message.startswith("ERROR"): print("ERROR: %s contains a duplicate ID. If %s was prepared manually then please check the IDs are correct. " % (idsFilename, idsFilename)) - orthofinder.Fail() + util.Fail() else: print("Tried to use only the first part of the accession in order to list the sequences in each orthologous group more concisely but these were not unique. Will use the full accession line instead.") try: - idExtract = orthofinder.FullAccession(idsFilename) + idExtract = util.FullAccession(idsFilename) idDict = idExtract.GetIDToNameDict() except: print("ERROR: %s contains a duplicate ID. If %s was prepared manually then please check the IDs are correct. 
" % (idsFilename, idsFilename)) - orthofinder.Fail() + util.Fail() # species names speciesDict = dict() @@ -299,72 +244,8 @@ def GetIDsDict(orthofinderWorkingDir): idDict = {seqID:speciesDict[seqID.split("_")[0]] + "_" + name for seqID, name in idDict.items()} return idDict -def GetOGsFile(userArg): - """returns the WorkingDirectory, ResultsDirectory and clusters_id_pairs filename""" - qSpecifiedResultsFile = False - if userArg == None: - print("ERROR: orthofinder_results_directory has not been specified") - orthofinder.Fail() - if os.path.isfile(userArg): - fn = os.path.split(userArg)[1] - if ("clusters_OrthoFinder_" not in fn) or ("txt_id_pairs.txt" not in fn): - print("ERROR:\n %s\nis neither a directory or a clusters_OrthoFinder_*.txt_id_pairs.txt file." % userArg) - orthofinder.Fail() - qSpecifiedResultsFile = True - # user has specified specific results file - elif userArg[-1] != os.path.sep: - userArg += os.path.sep - - # find required files - if qSpecifiedResultsFile: - orthofinderWorkingDir = os.path.split(userArg)[0] + os.sep - if not IsWorkingDirectory(orthofinderWorkingDir): - print("ERROR: cannot find files from OrthoFinder run in directory:\n %s" % orthofinderWorkingDir) - orthofinder.Fail() - else: - orthofinderWorkingDir = os.path.split(userArg)[0] if qSpecifiedResultsFile else userArg - if not IsWorkingDirectory(orthofinderWorkingDir): - orthofinderWorkingDir = userArg + "WorkingDirectory" + os.sep - if not IsWorkingDirectory(orthofinderWorkingDir): - print("ERROR: cannot find files from OrthoFinder run in directory:\n %s\nor\n %s\n" % (userArg, orthofinderWorkingDir)) - orthofinder.Fail() - - if qSpecifiedResultsFile: - print("Generating trees for orthogroups in file:\n %s" % userArg) - return orthofinderWorkingDir, orthofinderWorkingDir, userArg - else: - # identify orthogroups file - clustersFiles = glob.glob(orthofinderWorkingDir + "clusters_OrthoFinder_*.txt_id_pairs.txt") - orthogroupFiles = glob.glob(orthofinderWorkingDir + 
"OrthologousGroups*.txt") - if orthofinderWorkingDir != userArg: - orthogroupFiles += glob.glob(userArg + "OrthologousGroups*.txt") - # User may have specified a WorkingDirectory and results could be in directory above - if len(orthogroupFiles) < len(clustersFiles): - orthogroupFiles += glob.glob(userArg + ".." + os.sep + "OrthologousGroups*.txt") - clustersFiles = sorted(clustersFiles) - orthogroupFiles = sorted(orthogroupFiles) - if len(clustersFiles) > 1 or len(orthogroupFiles) > 1: - print("ERROR: Results from multiple OrthoFinder runs found\n") - print("Tab-delimiter OrthologousGroups*.txt files:") - for fn in orthogroupFiles: - print(" " + fn) - print("With corresponding cluster files:") - for fn in clustersFiles: - print(" " + fn) - print("\nPlease run with only one set of results in directories or specifiy the specific clusters_OrthoFinder_*.txt_id_pairs.txt file on the command line") - orthofinder.Fail() - - if len(clustersFiles) != 1 or len(orthogroupFiles) != 1: - print("ERROR: Results not found in <orthofinder_results_directory> or <orthofinder_results_directory>/WorkingDirectory") - print("\nCould not find:\n OrthologousGroups*.txt\nor\n clusters_OrthoFinder_*.txt_id_pairs.txt") - orthofinder.Fail() - - print("Generating trees for orthogroups in file:\n %s" % orthogroupFiles[0]) - print("and corresponding clusters file:\n %s" % clustersFiles[0]) - return orthofinderWorkingDir, userArg, clustersFiles[0] - if __name__ == "__main__": - print("\nOrthoFinder Alignments and Trees version %s Copyright (C) 2015 David Emms\n" % version) + print("\nOrthoFinder Alignments and Trees version %s Copyright (C) 2015 David Emms\n" % util.version) print(""" This program comes with ABSOLUTELY NO WARRANTY. This is free software, and you are welcome to redistribute it under certain conditions. 
For details please see the License.md that came with this software.\n""") @@ -372,12 +253,6 @@ if __name__ == "__main__": PrintHelp() sys.exit() - v = map(int, orthofinder.version.split(".")) - v = 100 * v[0] + 10*v[1] + v[2] - if v < 28: - print("ERROR: OrthoFinder program has not been updated, please update 'orthofinder.py' to the version %s\n" % version) - orthofinder.Fail() - # Get arguments userDir = None nProcesses = None @@ -388,26 +263,26 @@ if __name__ == "__main__": if arg == "-t" or arg == "--threads": if len(args) == 0: print("Missing option for command line argument -t") - orthofinder.Fail() + util.Fail() arg = args.pop(0) try: nProcesses = int(arg) except: print("Incorrect argument for number of threads: %s" % arg) - orthofinder.Fail() + util.Fail() else: userDir = arg # Check arguments - orthofinderWorkingDir, orthofinderResultsDir, clustersFilename_pairs = GetOGsFile(userDir) + orthofinderWorkingDir, orthofinderResultsDir, clustersFilename_pairs = util.GetOGsFile(userDir) if nProcesses == None: print("""Number of parallel processes has not been specified, will use the default value. Number of parallel processes can be specified using the -t option\n""") - nProcesses = orthofinder.nThreadsDefault + nProcesses = util.nThreadsDefault print("Using %d threads for alignments and trees\n" % nProcesses) - ogs = orthofinder.MCL.GetPredictedOGs(clustersFilename_pairs) + ogs = MCL.GetPredictedOGs(clustersFilename_pairs) idDict = GetIDsDict(orthofinderWorkingDir) treeGen = TreesForOrthogroups(orthofinderResultsDir, orthofinderWorkingDir)