From fe6cb24c17fb3ba004831553ee5d1d4bfb619ed0 Mon Sep 17 00:00:00 2001 From: David Emms <david_emms@hotmail.com> Date: Fri, 19 Aug 2016 14:18:06 +0100 Subject: [PATCH] Tidy up output --- get_orthologues.py | 27 ++++++++++++++++++++------- process_trees.py | 21 ++------------------- root_from_duplications.py | 4 ++-- trees_for_orthogroups.py | 17 ++++++++++------- 4 files changed, 34 insertions(+), 35 deletions(-) diff --git a/get_orthologues.py b/get_orthologues.py index 78ff58b..47a14b3 100755 --- a/get_orthologues.py +++ b/get_orthologues.py @@ -244,14 +244,13 @@ class DendroBLASTTrees(object): nSeqs = self.NumberOfSequences(self.species) ogMatrices = [np.zeros((n, n)) for n in nGenes] for iiSp, sp1 in enumerate(self.species): - orthofinder.util.PrintTime(str(sp1)) + orthofinder.util.PrintTime("Species %d" % sp1) Bs = [orthofinder.LoadMatrix("Bit", self.ogSet.fileInfo, iiSp, jjSp) for jjSp in xrange(len(self.species))] mins = np.ones((nSeqs[sp1], 1), dtype=np.float64)*9e99 maxes = np.zeros((nSeqs[sp1], 1), dtype=np.float64) for B, sp2 in zip(Bs, self.species): mins = np.minimum(mins, lil_min(B)) maxes = np.maximum(maxes, lil_max(B)) - orthofinder.util.PrintTime("got mins and maxes") for jjSp, B in enumerate(Bs): for og, m in zip(ogsPerSpecies, ogMatrices): for gi, i in og[iiSp]: @@ -344,6 +343,8 @@ class DendroBLASTTrees(object): D, spPairs = self.SpeciesTreeDistances(ogs, ogMatrices) cmd_spTree, spTreeFN_ids = self.PrepareSpeciesTreeCommand(D, spPairs) cmds_geneTrees = self.PrepareGeneTreeCommand() + print("\n3. Inferring gene and species trees") + print( "-----------------------------------") tfo.RunParallelCommandSets(nProcesses, [[cmd_spTree]] + cmds_geneTrees, qHideStdout = True) seqDict = self.ogSet.Spec_SeqDict() for iog in xrange(len(self.ogSet.ogs)): @@ -509,34 +510,46 @@ if __name__ == "__main__": userDir = arg # Check arguments + print("0. Getting Orthologues") + print("----------------------") orthofinderWorkingDir, orthofinderResultsDir, clustersFilename_pairs = tfo.GetOGsFile(userDir) if nProcesses == None: - print("""Number of parallel processes has not been specified, will use the default value. - Number of parallel processes can be specified using the -t option\n""") + print("""\nNumber of parallel processes has not been specified, will use the default value. +Number of parallel processes can be specified using the -t option.""") nProcesses = orthofinder.nThreadsDefault - print("Using %d threads for alignments and trees\n" % nProcesses) + print("Using %d threads for alignments and trees" % nProcesses) + print("\n1. Checking required programs are installed") + print( "-------------------------------------------") if not CanRunDependencies(orthofinderWorkingDir): orthofinder.Fail() + print("\n2. Reading sequence similarity scores") + print( "-------------------------------------") resultsDir = orthofinder.util.CreateNewWorkingDirectory(orthofinderResultsDir + "Orthologues_") ogSet = OrthoGroupsSet(orthofinderWorkingDir, clustersFilename_pairs, idExtractor = idextractor.FirstWordExtractor) db = DendroBLASTTrees(ogSet, resultsDir) db.ReadAndPickle() nOGs, D, spPairs, spTreeFN_ids = db.RunAnalysis() - + + print("\n4. Best outgroup(s) for species tree") + print( "------------------------------------") roots, clusters, rootedSpeciesTreeFN = rfd.GetRoot(spTreeFN_ids, os.path.split(db.treesPatIDs)[0] + "/", rfd.GeneToSpecies_dash, nProcesses, treeFmt = 1) db.RenameTreeTaxa(rootedSpeciesTreeFN, resultsDir + "SpeciesTree_rooted.txt", db.ogSet.SpeciesDict(), qFixNegatives=True) spDict = ogSet.SpeciesDict() for r in roots: - print([spDict[s] for s in r]) + print(", ".join([spDict[s] for s in r])) # speciesTreeFN = RunAstral(ogSet, db.treesPat, resultsDir) + print("\n5. Reconciling gene and species trees") + print( "-------------------------------------") dlcparResultsDir = RunDlcpar(db.treesPatIDs, ogSet, nOGs, rootedSpeciesTreeFN, db.workingDir) # Orthologue lists + print("\n6. Inferring orthologues from gene trees") + print( "----------------------------------------") pt.get_orthologue_lists(ogSet, resultsDir, dlcparResultsDir, db.workingDir) print("\nDone!") diff --git a/process_trees.py b/process_trees.py index f501605..8deac33 100644 --- a/process_trees.py +++ b/process_trees.py @@ -86,7 +86,7 @@ def multiply(specmin, specmax, matrixDir): product = M.dot(M.transpose()) return product, M -def WriteOrthologues(folder, spec1, spec2, orthologues, speciesDict, sequenceDict): +def WriteOrthologues(resultsDir, spec1, spec2, orthologues, speciesDict, sequenceDict): d1 = resultsDir + "Orthologues_" + speciesDict[str(spec1)] + "/" d2 = resultsDir + "Orthologues_" + speciesDict[str(spec2)] + "/" with open(d1 + '%s__v__%s.csv' % (speciesDict[str(spec1)], speciesDict[str(spec2)]), 'wb') as outfile1, open(d2 + '%s__v__%s.csv' % (speciesDict[str(spec2)], speciesDict[str(spec1)]), 'wb') as outfile2: @@ -98,13 +98,6 @@ def WriteOrthologues(folder, spec1, spec2, orthologues, speciesDict, sequenceDic writer1.writerow((", ".join([sequenceDict["%d_%d" % (spec1, o)] for o in genes1]), ", ".join([sequenceDict["%d_%d" % (spec2, o)] for o in genes2]))) writer2.writerow((", ".join([sequenceDict["%d_%d" % (spec2, o)] for o in genes2]), ", ".join([sequenceDict["%d_%d" % (spec1, o)] for o in genes1]))) -#def WriteOrthologues_OneFile(folder, spec1, speciesIDs, orthologues, speciesDict, sequenceDict): -# with open(folder + 'Orthologues_%s.csv' % speciesDict[str(spec1)], 'wb') as outfile: -# writer = csv.writer(outfile) -# writer.writerow([speciesDict[str(spec1)]] + [speciesDict[str(j)] for j in speciesIDs if j != spec1]) -# """Need to be able to go through the genes in turn for a single speices and get the ortholgues of that gene""" -# """Note that if other genes are in the same row then they aren't necessarily orthologues""" - def GetOrthologues(orig_matrix, orig_matrix_csc, index): orthologues = orig_matrix.getrowview(index).nonzero()[1] index = orthologues[0] @@ -123,12 +116,6 @@ def find_all(matrix, orig_matrix): done.update(orthologuesSp1) orthologues.append((orthologuesSp1, orthologuesSp2)) return orthologues - -#def AllOrthologuesForSpecies(index1, speciesIDs, matrixDir): -# orthologues = [] -# for index2 in xrange(len(speciesIDs)): -# product, M = multiply(index1, index2, matrixDir) -# orthologues.append(find_all(product, M)) def species_find_all(speciesDict, sequenceDict, matrixDir, resultsDir): # Calls multiply and find_all on each species pair, and appends the numbers from find_all's output to the relevant csv lists. @@ -140,12 +127,8 @@ def species_find_all(speciesDict, sequenceDict, matrixDir, resultsDir): for index1, index2 in itertools.product(xrange(nspecies), xrange(nspecies)): if index1 >= index2: continue product, M = multiply(index1, index2, matrixDir) - print("Ortholgues for species pair (%d, %d)" % (index2, index1)) orthologues = find_all(product, M) - WriteOrthologues(resultsDir, speciesIDs[index2], speciesIDs[index1], orthologues, speciesDict, sequenceDict) -# for index1 in xrange(nspecies): -# orthologues = AllOrthologuesForSpecies(index1, speciesIDs, matrixDir) -# WriteOrthologues_OneFile(resultsDir, speciesIDs, speciesIDs[index1], orthologues, speciesDict, sequenceDict) + WriteOrthologues(resultsDir, speciesIDs[index2], speciesIDs[index1], orthologues, speciesDict, sequenceDict) def get_orthologue_lists(ogSet, resultsDir, dlcparResultsDir, workingDir): # -> Matrices diff --git a/root_from_duplications.py b/root_from_duplications.py index cdacf7a..3b3416d 100644 --- a/root_from_duplications.py +++ b/root_from_duplications.py @@ -395,10 +395,10 @@ def ParsimonyRoot(allSpecies, clades, supported_clusters, qPrint=True): if score == m: if len(clade) > len(allSpecies)/2: root.append(allSpecies.difference(clade)) - if qPrint: print(root[-1]) + if qPrint: print(", ".join(root[-1])) else: root.append(clade) - if qPrint: print(root[-1]) + if qPrint: print(", ".join(root[-1])) if qPrint: print("") return root diff --git a/trees_for_orthogroups.py b/trees_for_orthogroups.py index a3c1dbf..5bf2912 100755 --- a/trees_for_orthogroups.py +++ b/trees_for_orthogroups.py @@ -43,14 +43,14 @@ import orthofinder version = "0.6.1" def RunCommandSet(commandSet, qHideStdout): - orthofinder.util.PrintTime("Runing command: %s" % commandSet[-1]) +# orthofinder.util.PrintTime("Runing command: %s" % commandSet[-1]) if qHideStdout: for cmd in commandSet: subprocess.call(cmd, shell=True, stdout=subprocess.PIPE) else: for cmd in commandSet: subprocess.call(cmd, shell=True) - orthofinder.util.PrintTime("Finshed command: %s" % commandSet[-1]) +# orthofinder.util.PrintTime("Finshed command: %s" % commandSet[-1]) class FastaWriter(object): def __init__(self, fastaFileDir): @@ -92,7 +92,7 @@ class FastaWriter(object): def SortSeqs(self, seqs): return sorted(seqs, key=lambda x: map(int, x.split("_"))) -def Worker_RunCommand(cmd_queue, qHideStdout): +def Worker_RunCommand(cmd_queue, nProcesses, nToDo, qHideStdout): """ repeatedly takes items to process from the queue until it is empty at which point it returns. Does not take a new task if it can't acquire queueLock as this indicates the queue is being rearranged. @@ -100,7 +100,10 @@ def Worker_RunCommand(cmd_queue, qHideStdout): """ while True: try: - commandSet = cmd_queue.get(True, 1) + i, commandSet = cmd_queue.get(True, 1) + nDone = i - nProcesses + 1 + if nDone >= 0 and divmod(nDone, 100)[1] == 0: + print("Done %d of %d" % (nDone, nToDo)) RunCommandSet(commandSet, qHideStdout) except Queue.Empty: return @@ -112,9 +115,9 @@ def RunParallelCommandSets(nProcesses, commands, qHideStdout = False): """ # Setup the workers and run cmd_queue = mp.Queue() - for cmd in commands: - cmd_queue.put(cmd) - runningProcesses = [mp.Process(target=Worker_RunCommand, args=(cmd_queue, qHideStdout)) for i_ in xrange(nProcesses)] + for i, cmd in enumerate(commands): + cmd_queue.put((i, cmd)) + runningProcesses = [mp.Process(target=Worker_RunCommand, args=(cmd_queue, nProcesses, i+1, qHideStdout)) for i_ in xrange(nProcesses)] for proc in runningProcesses: proc.start() -- GitLab