diff --git a/idextractor.py b/idextractor.py deleted file mode 100644 index ef6820389f538b25e8420c42fa0b3ddf27925f65..0000000000000000000000000000000000000000 --- a/idextractor.py +++ /dev/null @@ -1,57 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Fri Jun 10 14:10:43 2016 - -@author: david -""" - -class IDExtractor(object): - """IDExtractor deals with the fact that for different datasets a user will - want to extract a unique sequence ID from the fasta file accessions uin different - ways.""" - def GetIDToNameDict(self): - raise NotImplementedError("Should not be implemented") - def GetNameToIDDict(self): - raise NotImplementedError("Should not be implemented") - -class FullAccession(IDExtractor): - def __init__(self, idsFilename): - # only want the first part and nothing else (easy!) - self.idToNameDict = dict() - self.nameToIDDict = dict() - with open(idsFilename, 'rb') as idsFile: - for line in idsFile: - if line.startswith("#"): continue - id, accession = line.rstrip().split(": ", 1) - if id in self.idToNameDict: - raise RuntimeError("ERROR: A duplicate id was found in the fasta files: % s" % id) - self.idToNameDict[id] = accession - self.nameToIDDict[accession] = id - - def GetIDToNameDict(self): - return self.idToNameDict - - def GetNameToIDDict(self): - return self.nameToIDDict - -class FirstWordExtractor(IDExtractor): - def __init__(self, idsFilename): - # only want the first part and nothing else (easy!) - self.idToNameDict = dict() - self.nameToIDDict = dict() - with open(idsFilename, 'rb') as idsFile: - for line in idsFile: - id, rest = line.split(": ", 1) - accession = rest.split(None, 1)[0] - if accession in self.nameToIDDict: - raise RuntimeError("A duplicate accession was found using just first part: % s" % accession) - if id in self.idToNameDict: - raise RuntimeError("ERROR: A duplicate id was found in the fasta files: % s" % id) - self.idToNameDict[id] = accession - self.nameToIDDict[accession] = id - - def GetIDToNameDict(self): - return self.idToNameDict - - def GetNameToIDDict(self): - return self.nameToIDDict \ No newline at end of file diff --git a/orthofinder.py b/orthofinder.py index 580bc37a22d98e3456cbb4a9618c49c3071edff4..859a7ea04b1588168639d61532c94fb51024ed33 100644 --- a/orthofinder.py +++ b/orthofinder.py @@ -47,6 +47,9 @@ import Queue # Y import warnings # Y import time # Y +sys.path.append(os.path.split(os.path.abspath(__file__))[0] + "/scripts") +import get_orthologues + version = "0.6.1" fastaExtensions = {"fa", "faa", "fasta", "fas"} picProtocol = 1 @@ -179,6 +182,8 @@ class FullAccession(IDExtractor): for line in idsFile: if line.startswith("#"): continue id, accession = line.rstrip().split(": ", 1) + # Replace problematic characters + accession = accession.replace(":", "_").replace(",", "_").replace("(", "_").replace(")", "_") if id in self.idToNameDict: raise RuntimeError("ERROR: A duplicate id was found in the fasta files: % s" % id) self.idToNameDict[id] = accession @@ -199,6 +204,8 @@ class FirstWordExtractor(IDExtractor): for line in idsFile: id, rest = line.split(": ", 1) accession = rest.split(None, 1)[0] + # Replace problematic characters + accession = accession.replace(":", "_").replace(",", "_").replace("(", "_").replace(")", "_") if accession in self.nameToIDDict: raise RuntimeError("A duplicate accession was found using just first part: % s" % accession) if id in self.idToNameDict: @@ -459,8 +466,9 @@ class MCL: for iSpecies in xrange(nSpecies): row.append(", ".join(sorted(ogDict[iSpecies]))) thisOutputWriter.writerow(row) - print("""Orthologous groups have been written to tab-delimited files:\n %s\n %s""" % (outputFilename, singleGeneFilename)) - print("""And in OrthoMCL format:\n %s""" % (outputFilename[:-3] + "txt")) + resultsFilesString = "Orthologous groups have been written to tab-delimited files:\n %s\n %s\n" % (outputFilename, singleGeneFilename) + resultsFilesString += "And in OrthoMCL format:\n %s" % (outputFilename[:-3] + "txt") + return resultsFilesString """ scnorm @@ -1048,6 +1056,7 @@ if __name__ == "__main__": qUseFastaFiles = False # local to argument checking qXML = False qOnlyPrepare = False + qOrthologues = True # qUseSubset = False # forget once arguments parsed and have decided what the BLAST files are (Effectively part of BLASTFileProcessor) # Files @@ -1122,6 +1131,7 @@ if __name__ == "__main__": # workingDir_previous = GetDirectoryArgument(arg, args) elif arg == "-p" or arg == "--prepare": qOnlyPrepare = True + qOrthologues = False elif arg == "-h" or arg == "--help": PrintHelp() sys.exit() @@ -1192,9 +1202,9 @@ if __name__ == "__main__": workingDir = resultsDir + "WorkingDirectory" + os.sep os.mkdir(workingDir) if qUsePrecalculatedBlast: - print("%d threads for BLAST searches" % nBlast) + print("%d thread(s) for BLAST searches" % nBlast) if not qOnlyPrepare: - print("%d threads for OrthoFinder algorithm" % nProcessAlg) + print("%d thread(s) for OrthoFinder algorithm" % nProcessAlg) # check for BLAST+ and MCL - else instruct how to install and add to path print("\n1. Checking required programs are installed") @@ -1302,8 +1312,6 @@ if __name__ == "__main__": while proc.is_alive(): proc.join() - print("Done!") - # remove BLAST databases for f in glob.glob(workingDir + "BlastDBSpecies*"): os.remove(f) @@ -1352,15 +1360,23 @@ if __name__ == "__main__": print("\n6. Creating files for Orthologous Groups") print( "----------------------------------------") - PrintCitation() + if not qOrthologues: PrintCitation() ogs = MCL.GetPredictedOGs(clustersFilename_pairs) resultsBaseFilename = util.GetUnusedFilename(resultsDir + "OrthologousGroups", ".csv")[:-4] # remove .csv from base filename resultsBaseFilename = resultsDir + "OrthologousGroups" + ("" if iResultsVersion == 0 else "_%d" % iResultsVersion) idsDict = MCL.WriteOrthogroupFiles(ogs, [idsFilename], resultsBaseFilename, clustersFilename_pairs) - MCL.CreateOrthogroupTable(ogs, idsDict, speciesIdsFilename, speciesToUse, resultsBaseFilename) + orthogroupsResultsFilesString = MCL.CreateOrthogroupTable(ogs, idsDict, speciesIdsFilename, speciesToUse, resultsBaseFilename) + print(orthogroupsResultsFilesString) if qXML: numbersOfSequences = list(np.diff(seqsInfo.seqStartingIndices)) numbersOfSequences.append(seqsInfo.nSeqs - seqsInfo.seqStartingIndices[-1]) orthoxmlFilename = resultsBaseFilename + ".orthoxml" MCL.WriteOrthoXML(speciesInfo, ogs, numbersOfSequences, idsDict, orthoxmlFilename, speciesToUse) - print("\n") + + if qOrthologues: + print("\nRunning Orthologue Prediction") + print( "=============================") + orthologuesResultsFilesString = get_orthologues.GetOrthologues(workingDir, resultsDir, clustersFilename_pairs, nBlast) + print(orthogroupsResultsFilesString) + print(orthologuesResultsFilesString.rstrip()) + PrintCitation() diff --git a/get_orthologues.py b/scripts/get_orthologues.py similarity index 85% rename from get_orthologues.py rename to scripts/get_orthologues.py index 90e8ba459c8d9ad3c5a9b96eb31b1b2f39213f58..ab6a70dac4b081e17bf75e03562317e69bfd2937 100755 --- a/get_orthologues.py +++ b/scripts/get_orthologues.py @@ -16,8 +16,8 @@ import itertools import multiprocessing as mp import Queue +sys.path.append(os.path.split(os.path.abspath(__file__))[0] + "/..") import trees_for_orthogroups as tfo -import idextractor import orthofinder import root_from_duplications as rfd import process_trees as pt @@ -61,11 +61,11 @@ class Seq(object): # ============================================================================================================================== class OrthoGroupsSet(object): - def __init__(self, orthofinderWorkingDir, clustersFilename_pairs, idExtractor = idextractor.FirstWordExtractor): + def __init__(self, orthofinderWorkingDir, clustersFilename_pairs, idExtractor = orthofinder.FirstWordExtractor): self.workingDirOF = orthofinderWorkingDir self.seqIDsFN = orthofinderWorkingDir + "SequenceIDs.txt" self.speciesIDsFN = orthofinderWorkingDir + "SpeciesIDs.txt" - self.speciesIDsEx = idextractor.FullAccession(self.speciesIDsFN) + self.speciesIDsEx = orthofinder.FullAccession(self.speciesIDsFN) self._Spec_SeqIDs = None self._extractor = idExtractor self.clustersFN = clustersFilename_pairs @@ -193,7 +193,7 @@ def Worker_BlastScores(cmd_queue, seqsInfo, fileInfo, nProcesses, nToDo): i, args = cmd_queue.get(True, 1) nDone = i - nProcesses + 1 if nDone >= 0 and divmod(nDone, 10 if nToDo <= 200 else 100 if nToDo <= 2000 else 1000)[1] == 0: - print("Done %d of %d" % (nDone, nToDo)) + orthofinder.util.PrintTime("Done %d of %d" % (nDone, nToDo)) B = orthofinder.BlastFileProcessor.GetBLAST6Scores(seqsInfo, fileInfo, *args, qExcludeSelfHits = False) with open(fileInfo.outputDir + "Bit%d_%d.pic" % args, 'wb') as outfile: pic.dump(B, outfile, protocol = orthofinder.picProtocol) @@ -201,9 +201,10 @@ def Worker_BlastScores(cmd_queue, seqsInfo, fileInfo, nProcesses, nToDo): return class DendroBLASTTrees(object): - def __init__(self, ogSet, outD): + def __init__(self, ogSet, outD, nProcesses): self.outD = outD self.ogSet = ogSet + self.nProcesses = nProcesses self.species = sorted(map(int, self.ogSet.SpeciesDict().keys())) treesDir = outD + "Trees/" self.workingDir = outD + "WorkingDirectory/" @@ -233,7 +234,7 @@ class DendroBLASTTrees(object): proc.join() def NumberOfSequences(self, species): - ids = ogSet.SequenceDict() + ids = self.ogSet.SequenceDict() counts = Counter([g.split("_")[0] for g in ids.keys()]) counts = {int(k):v for k,v in counts.items() if int(k) in species} return counts @@ -350,7 +351,7 @@ class DendroBLASTTrees(object): cmds_geneTrees = self.PrepareGeneTreeCommand() print("\n3. Inferring gene and species trees") print( "-----------------------------------") - tfo.RunParallelCommandSets(nProcesses, [[cmd_spTree]] + cmds_geneTrees, qHideStdout = True) + tfo.RunParallelCommandSets(self.nProcesses, [[cmd_spTree]] + cmds_geneTrees, qHideStdout = True) seqDict = self.ogSet.Spec_SeqDict() for iog in xrange(len(self.ogSet.ogs)): self.RenameTreeTaxa(self.treesPatIDs % iog, self.treesPat % iog, seqDict, qFixNegatives=True) @@ -383,13 +384,11 @@ def RootGeneTreesArbitrarily(treesPat, nOGs, outputDir): for leaf in t: R = leaf break - print("%s is a zero length tree, it will be rooted at a arbitrary node" % outFN) elif AllEqualBranchLengths(t): # more generally, for any branch length all branches could have that same length for leaf in t: R = leaf break - print("%s has branches all of equal length, it will be rooted at a arbitrary node" % outFN) t.set_outgroup(R) t.resolve_polytomy() t.write(outfile = outFN) @@ -399,7 +398,6 @@ def RootGeneTreesArbitrarily(treesPat, nOGs, outputDir): for leaf in t: R = leaf break - print("%s using an arbitrary node" % outFN) t.set_outgroup(R) t.resolve_polytomy() t.write(outfile = outFN) @@ -488,7 +486,83 @@ def PrintHelp(): print("""-h, --help Print this help text""") orthofinder.PrintCitation() + +def GetResultsFilesString(rootedSpeciesTreeFN): + st = "" + baseResultsDir = os.path.abspath(os.path.split(rootedSpeciesTreeFN[0])[0] + "./../Trees/") + st += "Gene trees:\n %s\n" % baseResultsDir + if len(rootedSpeciesTreeFN) == 1: + resultsDir = os.path.split(rootedSpeciesTreeFN[0])[0] + st += "Rooted species tree:\n %s\n" % rootedSpeciesTreeFN[0] + st += "Species-by-species orthologues:\n %s\n" % resultsDir + else: + st += "\nMultiple potential outgroups were identified for the species tree. Each case has been analysed separately.\n" + st+= "Please review the rooted species trees and use the results corresponding to the correct one.\n\n" + for tFN in rootedSpeciesTreeFN: + resultsDir = os.path.split(tFN)[0] + "/" + st += "Rooted species tree:\n %s\n" % tFN + st += "Species-by-species orthologues directory:\n %s\n\n" % resultsDir + return st + + +def GetOrthologues(orthofinderWorkingDir, orthofinderResultsDir, clustersFilename_pairs, nProcesses): + ogSet = OrthoGroupsSet(orthofinderWorkingDir, clustersFilename_pairs, idExtractor = orthofinder.FirstWordExtractor) + if len(ogSet.speciesToUse) < 4: + print("ERROR: Not enough species to infer species tree") + orthofinder.Fail() + + print("\n1. Checking required programs are installed") + print( "-------------------------------------------") + if not CanRunDependencies(orthofinderWorkingDir): orthofinder.Fail() + + print("\n2. Reading sequence similarity scores") + print( "-------------------------------------") + resultsDir = orthofinder.util.CreateNewWorkingDirectory(orthofinderResultsDir + "Orthologues_") + db = DendroBLASTTrees(ogSet, resultsDir, nProcesses) + db.ReadAndPickle() + nOGs, D, spPairs, spTreeFN_ids = db.RunAnalysis() + + print("\n4. Best outgroup(s) for species tree") + print( "------------------------------------") + spDict = ogSet.SpeciesDict() + roots, clusters, rootedSpeciesTreeFN, nSupport = rfd.GetRoot(spTreeFN_ids, os.path.split(db.treesPatIDs)[0] + "/", rfd.GeneToSpecies_dash, nProcesses, treeFmt = 1) + if len(roots) > 1: + print("Observed %d duplications. %d support the best roots and %d contradict them." % (len(clusters), nSupport, len(clusters) - nSupport)) + print("Best outgroups for species tree:") + else: + print("Observed %d duplications. %d support the best root and %d contradict it." % (len(clusters), nSupport, len(clusters) - nSupport)) + print("Best outgroup for species tree:") + for r in roots: print(" " + (", ".join([spDict[s] for s in r])) ) + +# speciesTreeFN = RunAstral(ogSet, db.treesPat, resultsDir) + qMultiple = len(roots) > 1 + if qMultiple: print("\nAnalysing each of the potential species tree roots.") + resultsSpeciesTrees = [] + for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)): + if qMultiple: + resultsDir_new = resultsDir + "Orthologues_for_potential_outgroup_%d/" % i + else: + resultsDir_new = resultsDir + "Orthologues/" + os.mkdir(resultsDir_new) + resultsSpeciesTrees.append(resultsDir_new + "SpeciesTree_rooted.txt") + db.RenameTreeTaxa(speciesTree_fn, resultsSpeciesTrees[-1], db.ogSet.SpeciesDict(), qFixNegatives=True) + + print("\n5%s. Reconciling gene and species trees" % ("-%d"%i if qMultiple else "")) + print( "-------------------------------------" + ("--" if qMultiple else "")) + print("Root: " + (", ".join([spDict[s] for s in r]))) + dlcparResultsDir = RunDlcpar(db.treesPatIDs, ogSet, nOGs, speciesTree_fn, db.workingDir) + + # Orthologue lists + print("\n6%s. Inferring orthologues from gene trees" % ("-%d"%i if qMultiple else "")) + print( "----------------------------------------" + ("--" if qMultiple else "")) + pt.get_orthologue_lists(ogSet, resultsDir_new, dlcparResultsDir, db.workingDir) + + print("\n7. Writing results files") + print( "------------------------") + + return GetResultsFilesString(resultsSpeciesTrees) + if __name__ == "__main__": if len(sys.argv) < 2 or sys.argv[1] == "--help" or sys.argv[1] == "-h": PrintHelp() @@ -517,48 +591,16 @@ if __name__ == "__main__": # Check arguments print("0. Getting Orthologues") print("----------------------") - orthofinderWorkingDir, orthofinderResultsDir, clustersFilename_pairs = tfo.GetOGsFile(userDir) - ogSet = OrthoGroupsSet(orthofinderWorkingDir, clustersFilename_pairs, idExtractor = idextractor.FirstWordExtractor) - if len(ogSet.speciesToUse) < 4: - print("ERROR: Not enough species to infer species tree") - orthofinder.Fail() - if nProcesses == None: print("""\nNumber of parallel processes has not been specified, will use the default value. Number of parallel processes can be specified using the -t option.""") nProcesses = orthofinder.nThreadsDefault print("Using %d threads for alignments and trees" % nProcesses) - print("\n1. Checking required programs are installed") - print( "-------------------------------------------") - if not CanRunDependencies(orthofinderWorkingDir): orthofinder.Fail() - - print("\n2. Reading sequence similarity scores") - print( "-------------------------------------") - resultsDir = orthofinder.util.CreateNewWorkingDirectory(orthofinderResultsDir + "Orthologues_") - - db = DendroBLASTTrees(ogSet, resultsDir) - db.ReadAndPickle() - nOGs, D, spPairs, spTreeFN_ids = db.RunAnalysis() - - print("\n4. Best outgroup(s) for species tree") - print( "------------------------------------") - roots, clusters, rootedSpeciesTreeFN = rfd.GetRoot(spTreeFN_ids, os.path.split(db.treesPatIDs)[0] + "/", rfd.GeneToSpecies_dash, nProcesses, treeFmt = 1) + orthofinderWorkingDir, orthofinderResultsDir, clustersFilename_pairs = tfo.GetOGsFile(userDir) + resultsString = GetOrthologues(orthofinderWorkingDir, orthofinderResultsDir, clustersFilename_pairs, nProcesses) + print(resultsString) + orthofinder.PrintCitation() - spDict = ogSet.SpeciesDict() - for i, (r, speciesTree_fn) in enumerate(zip(roots, rootedSpeciesTreeFN)): - resultsDir_new = resultsDir + "Root%d/" % i - os.mkdir(resultsDir_new) - db.RenameTreeTaxa(speciesTree_fn, resultsDir_new + "SpeciesTree_%d_rooted.txt" % i, db.ogSet.SpeciesDict(), qFixNegatives=True) - print(", ".join([spDict[s] for s in r])) -# speciesTreeFN = RunAstral(ogSet, db.treesPat, resultsDir) - - print("\n5. Reconciling gene and species trees") - print( "-------------------------------------") - dlcparResultsDir = RunDlcpar(db.treesPatIDs, ogSet, nOGs, speciesTree_fn, db.workingDir) - # Orthologue lists - print("\n6. Inferring orthologues from gene trees") - print( "----------------------------------------") - pt.get_orthologue_lists(ogSet, resultsDir_new, dlcparResultsDir, db.workingDir) diff --git a/process_trees.py b/scripts/process_trees.py similarity index 97% rename from process_trees.py rename to scripts/process_trees.py index 9e70ab8d0ab4449ddcca8df345c09d6af1cf4979..94d696b40f1e84360b09d71011861eae4c072be0 100644 --- a/process_trees.py +++ b/scripts/process_trees.py @@ -10,6 +10,7 @@ from ete2 import Tree from scipy import sparse from collections import defaultdict +sys.path.append(os.path.split(__file__)[0] + "/..") import orthofinder def natural_sort_key(s, _nsre=re.compile('([0-9]+)')): @@ -54,7 +55,7 @@ def one_to_one_efficient(orthodict, genenumbers, speciesLabels, iSpecies, output try to mostly deal with iSpecies which is the ordinal number not the label it is given """ #Creates all matrices and appends them to matrixlist. - sys.stdout.write("Processing orthologues for species %d" % iSpecies) + orthofinder.util.PrintTime("Processing orthologues for species %d" % iSpecies) matrixlist = [] numspecies = len(speciesLabels) speciesLabelsReverse = {label:i for i, label in enumerate(speciesLabels)} @@ -63,7 +64,6 @@ def one_to_one_efficient(orthodict, genenumbers, speciesLabels, iSpecies, output matrixlist.append(sparse.lil_matrix((genenumbers[iSpecies], genenumbers[j]), dtype=np.dtype(np.int8))) else: matrixlist.append(None) - print(' - done') #Fill matrices with orthodata iSpecieslist = [x for x in orthodict if x.startswith('%d_' % speciesLabels[iSpecies])] for count, queryGene in enumerate(iSpecieslist): @@ -144,7 +144,6 @@ def get_orthologue_lists(ogSet, resultsDir, dlcparResultsDir, workingDir): if __name__ == "__main__": - import idextractor import get_orthologues as go d = "/home/david/projects/Orthology/OrthoFinder/Development/Orthologues/ExampleDataset_Results/" orthofinderWorkingDir = d + "WorkingDirectory/" @@ -153,7 +152,7 @@ if __name__ == "__main__": dlcparResultsDir = resultsDir + "/WorkingDirectory/dlcpar/" workingDir = resultsDir + "/WorkingDirectory/" - ogSet = go.OrthoGroupsSet(orthofinderWorkingDir, clustersFilename_pairs, idExtractor = idextractor.FirstWordExtractor) + ogSet = go.OrthoGroupsSet(orthofinderWorkingDir, clustersFilename_pairs, idExtractor = orthofinder.FirstWordExtractor) get_orthologue_lists(ogSet, resultsDir, dlcparResultsDir, workingDir) \ No newline at end of file diff --git a/root_from_duplications.py b/scripts/root_from_duplications.py similarity index 93% rename from root_from_duplications.py rename to scripts/root_from_duplications.py index 98e6f1eb6a01a77ffb8628184b047dc715f40e9f..bcb71cef9edaad57566a74ce851ad4648357d508 100644 --- a/root_from_duplications.py +++ b/scripts/root_from_duplications.py @@ -374,7 +374,7 @@ def RootAtClade(tree, accs_in_clade): tree.set_outgroup(node) return tree -def ParsimonyRoot(allSpecies, clades, supported_clusters, qPrint=True): +def ParsimonyRoot(allSpecies, clades, supported_clusters): c = Counter(supported_clusters) contradictions = dict() for clade in clades: @@ -386,21 +386,17 @@ def ParsimonyRoot(allSpecies, clades, supported_clusters, qPrint=True): contradictions[clade] = against m = min(contradictions.values()) n = len(supported_clusters) - if qPrint: - print("Observed %d duplications" % n) - print("%d support the best root and %d contradict it" % (n-m, m)) - print("Best root(s)") - root = [] + nSupport = n-m + roots = [] for clade, score in contradictions.items(): if score == m: if len(clade) > len(allSpecies)/2: - root.append(allSpecies.difference(clade)) -# if qPrint: print(", ".join(root[-1])) + roots.append(allSpecies.difference(clade)) + elif len(clade) == len(allSpecies)/ 2: + if allSpecies.difference(clade) not in roots: roots.append(clade) else: - root.append(clade) -# if qPrint: print(", ".join(root[-1])) -# if qPrint: print("") - return root + roots.append(clade) + return roots, nSupport def PlotTree(speciesTree, treesDir, supported_clusters, qSimplePrint=True): c = Counter(supported_clusters) @@ -438,15 +434,16 @@ def GetRoot(speciesTreeFN, treesDir, GeneToSpeciesMap, nProcessors, treeFmt=None clusters = [] for l in list_of_lists: clusters.extend(l) - roots = list(set(ParsimonyRoot(species, dict_clades.keys(), clusters, qPrint=True))) + roots, nSupport = ParsimonyRoot(species, dict_clades.keys(), clusters) + roots = list(set(roots)) speciesTrees_rootedFNs =[] for i, r in enumerate(roots): - speciesTree = RootAtClade(speciesTree, r) - speciesTree_rootedFN = os.path.splitext(speciesTreeFN)[0] + "_%d_rooted.txt" % i + speciesTree = RootAtClade(speciesTree, r) + speciesTree_rootedFN = os.path.splitext(speciesTreeFN)[0] + "_%d_rooted.txt" % i # speciesTree = LabelNodes() - speciesTree.write(outfile=speciesTree_rootedFN, format=4) - speciesTrees_rootedFNs.append(speciesTree_rootedFN) - return roots, clusters, speciesTrees_rootedFNs + speciesTree.write(outfile=speciesTree_rootedFN, format=4) + speciesTrees_rootedFNs.append(speciesTree_rootedFN) + return roots, clusters, speciesTrees_rootedFNs, nSupport if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -485,11 +482,16 @@ if __name__ == "__main__": for fn in glob.glob(args.input_tree + "/*"): c = SupportedHierachies_wrapper(fn, GeneToSpecies, species, dict_clades, clade_names) clusters.extend(c) - ParsimonyRoot(species, dict_clades.keys(), clusters) + roots, nSupport = ParsimonyRoot(species, dict_clades.keys(), clusters) + if len(roots) > 1: print("Observed %d duplications. %d support the best roots and %d contradict them." % (len(clusters), nSupport, len(clusters) - nSupport)) + else: print("Observed %d duplications. %d support the best root and %d contradict it." % (len(clusters), nSupport, len(clusters) - nSupport)) + print("Best outgroup(s) for species tree:") + for r in roots: + print(r) if args.verbose: PlotTree(speciesTree, args.input_tree, clusters) else: root, clusters, _ = GetRoot(args.Species_tree, args.input_tree, GeneToSpecies, nProcs, treeFmt = 1) - for r in root: print(r) + for r in root: print(r) speciesTree = ete2.Tree(args.Species_tree, format=1) if args.verbose: PlotTree(speciesTree, args.input_tree, clusters, qSimplePrint=False) diff --git a/trees_for_orthogroups.py b/trees_for_orthogroups.py index da3f44444e18fe566f22c88c075fa67da8d223ee..9e85022bb2288542ecee8d2483614658804d39c1 100755 --- a/trees_for_orthogroups.py +++ b/trees_for_orthogroups.py @@ -103,7 +103,7 @@ def Worker_RunCommand(cmd_queue, nProcesses, nToDo, qHideStdout): i, commandSet = cmd_queue.get(True, 1) nDone = i - nProcesses + 1 if nDone >= 0 and divmod(nDone, 10 if nToDo <= 200 else 100 if nToDo <= 2000 else 1000)[1] == 0: - print("Done %d of %d" % (nDone, nToDo)) + orthofinder.util.PrintTime("Done %d of %d" % (nDone, nToDo)) RunCommandSet(commandSet, qHideStdout) except Queue.Empty: return