diff --git a/orthofinder/scripts/get_orthologues.py b/orthofinder/scripts/get_orthologues.py index 85786f60aea8ac39df7cbedb45b4bcd08cf76cfc..478c25e4edea6ea8ff5dfc4b5cf4e11329a8cce5 100755 --- a/orthofinder/scripts/get_orthologues.py +++ b/orthofinder/scripts/get_orthologues.py @@ -90,10 +90,11 @@ class OrthoGroupsSet(object): self._extractor = idExtractor self.clustersFN = clustersFilename_pairs self.seqIDsEx = None - self.ogs = self.OGs() + self.ogs = None self.speciesToUse = util.GetSpeciesToUse(self.speciesIDsFN) self.seqsInfo = util.GetSeqsInfo(orthofinderWorkingDir, self.speciesToUse) self.fileInfo = util.FileInfo(inputDir=orthofinderWorkingDir, outputDir = orthofinderWorkingDir, graphFilename="") + self.id_to_og = None def SequenceDict(self): if self.seqIDsEx == None: @@ -114,9 +115,18 @@ class OrthoGroupsSet(object): return self._Spec_SeqIDs def OGs(self): - ogs = MCL.GetPredictedOGs(self.clustersFN) - ogs = [[Seq(g) for g in og] for og in ogs if len(og) >= 4] - return ogs + if self.ogs != None: + return self.ogs + self.ogs = MCL.GetPredictedOGs(self.clustersFN) + self.ogs = [[Seq(g) for g in og] for og in self.ogs if len(og) >= 4] + return self.ogs + + def ID_to_OG_Dict(self): + if self.id_to_og != None: + return self.id_to_og + self.id_to_og = {g.ToString():iog for iog, og in enumerate(self.OGs()) for g in og} + return self.id_to_og + # ============================================================================================================================== @@ -144,14 +154,15 @@ def lil_max(M): # ASTRAL def GetOGsToUse(ogSet): - return range(100, min(10000, len(ogSet.ogs))) + return range(100, min(10000, len(ogSet.OGs()))) def CreateTaxaMapFile(ogSet, i_ogs_to_use, outputFN): """Get max number of sequences per species""" sp_max = defaultdict(int) + ogs = ogSet.OGs() for iog in i_ogs_to_use: thisCount = defaultdict(int) - for seq in ogSet.ogs[iog]: + for seq in ogs[iog]: thisCount[seq.iSp] += 1 for iSp in thisCount: sp_max[iSp] = max(sp_max[iSp], thisCount[iSp]) @@ -335,8 +346,9 @@ class DendroBLASTTrees(object): def PrepareGeneTreeCommand(self): cmds = [] - for iog in xrange(len(self.ogSet.ogs)): - nTaxa = len(self.ogSet.ogs[iog]) + ogs = self.ogSet.OGs() + for iog in xrange(len(ogs)): + nTaxa = len(ogs[iog]) cmds.append([" ".join(["fastme", "-i", self.distPat % iog, "-o", self.treesPatIDs % iog, "-w", "O"] + (["-s"] if nTaxa < 1000 else []))]) return cmds @@ -364,7 +376,7 @@ class DendroBLASTTrees(object): print( "-----------------------------------") util.RunParallelOrderedCommandLists(self.nProcesses, [[cmd_spTree]] + cmds_geneTrees, qHideStdout = True) seqDict = self.ogSet.Spec_SeqDict() - for iog in xrange(len(self.ogSet.ogs)): + for iog in xrange(len(self.ogSet.OGs())): self.RenameTreeTaxa(self.treesPatIDs % iog, self.treesPat % iog, seqDict, qFixNegatives=True) self.RenameTreeTaxa(spTreeFN_ids, self.workingDir + "SpeciesTree_unrooted.txt", self.ogSet.SpeciesDict(), qFixNegatives=True) #spTreeFN_ids = RunAstral(self.ogSet, self.treesPatIDs, self.workingDir) diff --git a/orthofinder/scripts/orthologues_from_recon_trees.py b/orthofinder/scripts/orthologues_from_recon_trees.py index 840eff48364d6715d3542e79646c557e0261f43c..dd3948735f25ea5d9e95f4aff32c5ea19e5a64b2 100644 --- a/orthofinder/scripts/orthologues_from_recon_trees.py +++ b/orthofinder/scripts/orthologues_from_recon_trees.py @@ -109,17 +109,21 @@ def multiply(specmin, specmax, matrixDir): product = M.dot(M.transpose()) return product, M -def WriteOrthologues(resultsDir, spec1, spec2, orthologues, speciesDict, sequenceDict): +def WriteOrthologues(resultsDir, spec1, spec2, orthologues, ogSet): + speciesDict = ogSet.SpeciesDict() + id_to_og = ogSet.ID_to_OG_Dict() + sequenceDict = ogSet.SequenceDict() d1 = resultsDir + "Orthologues_" + speciesDict[str(spec1)] + "/" d2 = resultsDir + "Orthologues_" + speciesDict[str(spec2)] + "/" with open(d1 + '%s__v__%s.csv' % (speciesDict[str(spec1)], speciesDict[str(spec2)]), 'wb') as outfile1, open(d2 + '%s__v__%s.csv' % (speciesDict[str(spec2)], speciesDict[str(spec1)]), 'wb') as outfile2: writer1 = csv.writer(outfile1) writer2 = csv.writer(outfile2) - writer1.writerow((speciesDict[str(spec1)], speciesDict[str(spec2)])) - writer2.writerow((speciesDict[str(spec2)], speciesDict[str(spec1)])) + writer1.writerow(("Orthogroup", speciesDict[str(spec1)], speciesDict[str(spec2)])) + writer2.writerow(("Orthogroup", speciesDict[str(spec2)], speciesDict[str(spec1)])) for genes1, genes2 in orthologues: - writer1.writerow((", ".join([sequenceDict["%d_%d" % (spec1, o)] for o in genes1]), ", ".join([sequenceDict["%d_%d" % (spec2, o)] for o in genes2]))) - writer2.writerow((", ".join([sequenceDict["%d_%d" % (spec2, o)] for o in genes2]), ", ".join([sequenceDict["%d_%d" % (spec1, o)] for o in genes1]))) + og = "OG%07d" % id_to_og["%d_%d" % (spec1, genes1[0])] + writer1.writerow((og, ", ".join([sequenceDict["%d_%d" % (spec1, o)] for o in genes1]), ", ".join([sequenceDict["%d_%d" % (spec2, o)] for o in genes2]))) + writer2.writerow((og, ", ".join([sequenceDict["%d_%d" % (spec2, o)] for o in genes2]), ", ".join([sequenceDict["%d_%d" % (spec1, o)] for o in genes1]))) def GetOrthologues(orig_matrix, orig_matrix_csc, index): orthologues = orig_matrix.getrowview(index).nonzero()[1] @@ -140,7 +144,8 @@ def find_all(matrix, orig_matrix): orthologues.append((orthologuesSp1, orthologuesSp2)) return orthologues -def species_find_all(speciesDict, sequenceDict, matrixDir, resultsDir): +def species_find_all(ogSet, matrixDir, resultsDir): + speciesDict = ogSet.SpeciesDict() # Calls multiply and find_all on each species pair, and appends the numbers from find_all's output to the relevant csv lists. speciesIDs = sorted(map(int, speciesDict.keys())) nspecies = len(speciesIDs) @@ -151,7 +156,7 @@ def species_find_all(speciesDict, sequenceDict, matrixDir, resultsDir): if index1 >= index2: continue product, M = multiply(index1, index2, matrixDir) orthologues = find_all(product, M) - WriteOrthologues(resultsDir, speciesIDs[index2], speciesIDs[index1], orthologues, speciesDict, sequenceDict) + WriteOrthologues(resultsDir, speciesIDs[index2], speciesIDs[index1], orthologues, ogSet) def get_orthologue_lists(ogSet, resultsDir, dlcparResultsDir, workingDir): # -> Matrices @@ -164,6 +169,6 @@ def get_orthologue_lists(ogSet, resultsDir, dlcparResultsDir, workingDir): one_to_one_efficient(orthodict, genenumbers, speciesLabels, iSpecies, matrixDir) # -> csv files - species_find_all(ogSet.SpeciesDict(), ogSet.SequenceDict(), matrixDir, resultsDir) + species_find_all(ogSet, matrixDir, resultsDir)