diff --git a/get_orthologues.py b/get_orthologues.py index 47a14b37bd2c061d316e99a3fc9e6d65e4112118..90e8ba459c8d9ad3c5a9b96eb31b1b2f39213f58 100755 --- a/get_orthologues.py +++ b/get_orthologues.py @@ -187,10 +187,13 @@ def RunAstral(ogSet, treesPat, workingDir): # ============================================================================================================================== # DendroBlast -def Worker_BlastScores(cmd_queue, seqsInfo, fileInfo): +def Worker_BlastScores(cmd_queue, seqsInfo, fileInfo, nProcesses, nToDo): while True: try: - args = cmd_queue.get(True, 1) + i, args = cmd_queue.get(True, 1) + nDone = i - nProcesses + 1 + if nDone >= 0 and divmod(nDone, 10 if nToDo <= 200 else 100 if nToDo <= 2000 else 1000)[1] == 0: + print("Done %d of %d" % (nDone, nToDo)) B = orthofinder.BlastFileProcessor.GetBLAST6Scores(seqsInfo, fileInfo, *args, qExcludeSelfHits = False) with open(fileInfo.outputDir + "Bit%d_%d.pic" % args, 'wb') as outfile: pic.dump(B, outfile, protocol = orthofinder.picProtocol) @@ -217,10 +220,12 @@ class DendroBLASTTrees(object): def ReadAndPickle(self): cmd_queue = mp.Queue() + i = 0 for iSp in xrange(len(self.ogSet.seqsInfo.speciesToUse)): for jSp in xrange(len(self.ogSet.seqsInfo.speciesToUse)): - cmd_queue.put((iSp, jSp)) - runningProcesses = [mp.Process(target=Worker_BlastScores, args=(cmd_queue, self.ogSet.seqsInfo, self.ogSet.fileInfo)) for i_ in xrange(nThreads)] + cmd_queue.put((i, (iSp, jSp))) + i+=1 + runningProcesses = [mp.Process(target=Worker_BlastScores, args=(cmd_queue, self.ogSet.seqsInfo, self.ogSet.fileInfo, nThreads, i)) for i_ in xrange(nThreads)] for proc in runningProcesses: proc.start() for proc in runningProcesses: @@ -244,7 +249,7 @@ class DendroBLASTTrees(object): nSeqs = self.NumberOfSequences(self.species) ogMatrices = [np.zeros((n, n)) for n in nGenes] for iiSp, sp1 in enumerate(self.species): - orthofinder.util.PrintTime("Species %d" % sp1) + orthofinder.util.PrintTime("Processing species %d" % sp1) Bs = [orthofinder.LoadMatrix("Bit", self.ogSet.fileInfo, iiSp, jjSp) for jjSp in xrange(len(self.species))] mins = np.ones((nSeqs[sp1], 1), dtype=np.float64)*9e99 maxes = np.zeros((nSeqs[sp1], 1), dtype=np.float64) @@ -513,6 +518,10 @@ if __name__ == "__main__": print("0. Getting Orthologues") print("----------------------") orthofinderWorkingDir, orthofinderResultsDir, clustersFilename_pairs = tfo.GetOGsFile(userDir) + ogSet = OrthoGroupsSet(orthofinderWorkingDir, clustersFilename_pairs, idExtractor = idextractor.FirstWordExtractor) + if len(ogSet.speciesToUse) < 4: + print("ERROR: Not enough species to infer species tree") + orthofinder.Fail() if nProcesses == None: print("""\nNumber of parallel processes has not been specified, will use the default value. @@ -527,7 +536,6 @@ Number of parallel processes can be specified using the -t option.""") print("\n2. Reading sequence similarity scores") print( "-------------------------------------") resultsDir = orthofinder.util.CreateNewWorkingDirectory(orthofinderResultsDir + "Orthologues_") - ogSet = OrthoGroupsSet(orthofinderWorkingDir, clustersFilename_pairs, idExtractor = idextractor.FirstWordExtractor) db = DendroBLASTTrees(ogSet, resultsDir) db.ReadAndPickle() @@ -536,20 +544,21 @@ Number of parallel processes can be specified using the -t option.""") print("\n4. Best outgroup(s) for species tree") print( "------------------------------------") roots, clusters, rootedSpeciesTreeFN = rfd.GetRoot(spTreeFN_ids, os.path.split(db.treesPatIDs)[0] + "/", rfd.GeneToSpecies_dash, nProcesses, treeFmt = 1) - db.RenameTreeTaxa(rootedSpeciesTreeFN, resultsDir + "SpeciesTree_rooted.txt", db.ogSet.SpeciesDict(), qFixNegatives=True) spDict = ogSet.SpeciesDict() - for r in roots: + for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)): + resultsDir_new = resultsDir + "Root%d/" % i + os.mkdir(resultsDir_new) + db.RenameTreeTaxa(speciesTree_fn, resultsDir_new + "SpeciesTree_%d_rooted.txt" % i, db.ogSet.SpeciesDict(), qFixNegatives=True) print(", ".join([spDict[s] for s in r])) # speciesTreeFN = RunAstral(ogSet, db.treesPat, resultsDir) - print("\n5. Reconciling gene and species trees") - print( "-------------------------------------") - dlcparResultsDir = RunDlcpar(db.treesPatIDs, ogSet, nOGs, rootedSpeciesTreeFN, db.workingDir) + print("\n5. Reconciling gene and species trees") + print( "-------------------------------------") + dlcparResultsDir = RunDlcpar(db.treesPatIDs, ogSet, nOGs, speciesTree_fn, db.workingDir) - # Orthologue lists - print("\n6. Inferring orthologues from gene trees") - print( "----------------------------------------") - pt.get_orthologue_lists(ogSet, resultsDir, dlcparResultsDir, db.workingDir) + # Orthologue lists + print("\n6. Inferring orthologues from gene trees") + print( "----------------------------------------") + pt.get_orthologue_lists(ogSet, resultsDir_new, dlcparResultsDir, db.workingDir) - print("\nDone!") diff --git a/orthofinder.py b/orthofinder.py index 23ebd3235d559785ddf98fb2af1171c2b81c368b..580bc37a22d98e3456cbb4a9618c49c3071edff4 100644 --- a/orthofinder.py +++ b/orthofinder.py @@ -688,7 +688,7 @@ class BlastFileProcessor(object): sys.stderr.write("but found a query/hit in the Blast%d_%d.txt for sequence %d_%d (i.e. %s sequence in species %d).\n" % (iSpecies, jSpecies, kSpecies, sequencekID, ord(sequencekID+1), kSpecies)) sys.exit() except Exception: - sys.stderr.write("Malformatted line in %sBlast%d_%d.txt\nOffending line was:\n" % (fileInfo.inputDir, iSpecies, jSpecies)) + sys.stderr.write("Malformatted line in %sBlast%d_%d.txt\nOffending line was:\n" % (fileInfo.inputDir, seqsInfo.speciesToUse[iSpecies], seqsInfo.speciesToUse[jSpecies])) sys.stderr.write("\t".join(row) + "\n") sys.exit() return B diff --git a/process_trees.py b/process_trees.py index 8deac33a256cbf8cfe708efa261abc92774a8c43..9e70ab8d0ab4449ddcca8df345c09d6af1cf4979 100644 --- a/process_trees.py +++ b/process_trees.py @@ -47,7 +47,6 @@ def GetSpeciesGenesInfo(ogSet): seqsInfo = orthofinder.GetSeqsInfo(ogSet.workingDirOF, speciesLabels) genenumbers = list(np.diff(seqsInfo.seqStartingIndices)) genenumbers.append(seqsInfo.nSeqs - seqsInfo.seqStartingIndices[-1]) - genenumbers = [genenumbers[i] for i in speciesLabels] return speciesLabels, genenumbers def one_to_one_efficient(orthodict, genenumbers, speciesLabels, iSpecies, outputDir): diff --git a/root_from_duplications.py b/root_from_duplications.py index 3b3416d5bb357c1598255e10c47f2901bf52ff00..98e6f1eb6a01a77ffb8628184b047dc715f40e9f 100644 --- a/root_from_duplications.py +++ b/root_from_duplications.py @@ -395,11 +395,11 @@ def ParsimonyRoot(allSpecies, clades, supported_clusters, qPrint=True): if score == m: if len(clade) > len(allSpecies)/2: root.append(allSpecies.difference(clade)) - if qPrint: print(", ".join(root[-1])) +# if qPrint: print(", ".join(root[-1])) else: root.append(clade) - if qPrint: print(", ".join(root[-1])) - if qPrint: print("") +# if qPrint: print(", ".join(root[-1])) +# if qPrint: print("") return root def PlotTree(speciesTree, treesDir, supported_clusters, qSimplePrint=True): @@ -438,12 +438,15 @@ def GetRoot(speciesTreeFN, treesDir, GeneToSpeciesMap, nProcessors, treeFmt=None clusters = [] for l in list_of_lists: clusters.extend(l) - roots = ParsimonyRoot(species, dict_clades.keys(), clusters, qPrint=False) - speciesTree_rootedFN = os.path.splitext(speciesTreeFN)[0] + "_rooted.txt" - speciesTree = RootAtClade(speciesTree, roots[0]) + roots = list(set(ParsimonyRoot(species, dict_clades.keys(), clusters, qPrint=True))) + speciesTrees_rootedFNs =[] + for i, r in enumerate(roots): + speciesTree = RootAtClade(speciesTree, r) + speciesTree_rootedFN = os.path.splitext(speciesTreeFN)[0] + "_%d_rooted.txt" % i # speciesTree = LabelNodes() - speciesTree.write(outfile=speciesTree_rootedFN, format=4) - return roots, clusters, speciesTree_rootedFN + speciesTree.write(outfile=speciesTree_rootedFN, format=4) + speciesTrees_rootedFNs.append(speciesTree_rootedFN) + return roots, clusters, speciesTrees_rootedFNs if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -485,7 +488,8 @@ if __name__ == "__main__": ParsimonyRoot(species, dict_clades.keys(), clusters) if args.verbose: PlotTree(speciesTree, args.input_tree, clusters) else: - root, clusters = GetRoot(args.Species_tree, args.input_tree, GeneToSpecies, nProcs, treeFmt = 1) + root, clusters, _ = GetRoot(args.Species_tree, args.input_tree, GeneToSpecies, nProcs, treeFmt = 1) + for r in root: print(r) speciesTree = ete2.Tree(args.Species_tree, format=1) if args.verbose: PlotTree(speciesTree, args.input_tree, clusters, qSimplePrint=False) - \ No newline at end of file + diff --git a/trees_for_orthogroups.py b/trees_for_orthogroups.py index 5bf2912c42415a142e7497b7a40741f80ef9e949..da3f44444e18fe566f22c88c075fa67da8d223ee 100755 --- a/trees_for_orthogroups.py +++ b/trees_for_orthogroups.py @@ -102,7 +102,7 @@ def Worker_RunCommand(cmd_queue, nProcesses, nToDo, qHideStdout): try: i, commandSet = cmd_queue.get(True, 1) nDone = i - nProcesses + 1 - if nDone >= 0 and divmod(nDone, 100)[1] == 0: + if nDone >= 0 and divmod(nDone, 10 if nToDo <= 200 else 100 if nToDo <= 2000 else 1000)[1] == 0: print("Done %d of %d" % (nDone, nToDo)) RunCommandSet(commandSet, qHideStdout) except Queue.Empty: