diff --git a/orthofinder/orthofinder.py b/orthofinder/orthofinder.py index 02ee69440eb0002a2b7e7ce2c176874d598d1596..d4e3ee0fdc321dbb7fc880317a4d2ae7d4744723 100755 --- a/orthofinder/orthofinder.py +++ b/orthofinder/orthofinder.py @@ -435,7 +435,7 @@ class WaterfallMethod: return sparse.lil_matrix(B.get_shape()) @staticmethod - def ProcessBlastHits(seqsInfo, blastDir_list, Lengths, iSpecies, qDoubleBlast, qNormalize): + def ProcessBlastHits(seqsInfo, blastDir_list, Lengths, iSpecies, qDoubleBlast, qNormalize, qNormalizeAndFixZeros): with warnings.catch_warnings(): warnings.simplefilter("ignore") # process up to the best hits for each species; Bi is a list of matrices @@ -447,8 +447,8 @@ class WaterfallMethod: if qNormalize: WaterfallMethod.WriteNewScores(seqsInfo, blastDir_list, Bij, seqsInfo.speciesToUse[iSpecies], seqsInfo.speciesToUse[jSpecies], qDoubleBlast=qDoubleBlast) Bi.append(Bij) - for jSpecies in range(seqsInfo.nSpecies): - if qNormalize: + if qNormalizeAndFixZeros: + for jSpecies in range(seqsInfo.nSpecies): WaterfallMethod.WriteFixNewScores(seqsInfo, blastDir_list, Bi[jSpecies], seqsInfo.speciesToUse[iSpecies], seqsInfo.speciesToUse[jSpecies], qDoubleBlast=qDoubleBlast) matrices.DumpMatrixArray("B", Bi, iSpecies) BH = GetBH_s(Bi, seqsInfo, iSpecies) @@ -822,6 +822,7 @@ def PrintHelp(prog_caller): print("WORKFLOW RESTART COMMANDS:") print(" -b <dir> Start OrthoFinder from pre-computed BLAST results in <dir>") print(" -N <dir> Normalize Blast scores from pre-computed BLAST results in <dir>, then exit") + print(" -NF <dir> Normalize Blast scores from pre-computed BLAST results in <dir>, fix self-hits 0.0 values, then exit") print(" -fg <dir> Start OrthoFinder from pre-computed orthogroups in <dir>") print(" -ft <dir> Start OrthoFinder from pre-computed gene trees in <dir>") @@ -889,6 +890,7 @@ class Options(object):# self.speciesTreeFN = None self.mclInflation = g_mclInflation self.qNormalize = False # score normalization from Blast results. Already done when using qStartFromFasta. + self.qNormalizeAndFixZeros = False # score normalization from Blast results. Already done when using qStartFromFasta. def what(self): for k, v in self.__dict__.items(): @@ -950,6 +952,13 @@ def ProcessArgs(prog_caller): util.Fail() options.qNormalize = True continuationDir = GetDirectoryArgument(arg, args) + elif arg == "-NF" or arg == "--normalize-fix": + if options.qNormalizeAndFixZeros: + print("Repeated argument: -NF/--normalize-fix\n") + util.Fail() + options.qNormalize = True + options.qNormalizeAndFixZeros = True + continuationDir = GetDirectoryArgument(arg, args) elif arg == "-fg" or arg == "--from-groups": if options.qStartFromGroups: print("Repeated argument: -fg/--from-groups\n") @@ -1183,9 +1192,14 @@ def ProcessArgs(prog_caller): print("ERROR: Search program (%s) not configured in config.json file" % options.search_program) util.Fail() - if options.qNormalize and ( options.qStartFromFasta or options.qStartFromBlast or options.qStartFromGroups or options.qStartFromTrees ) : + if options.qNormalize and ( options.qNormalizeAndFixZeros or options.qStartFromFasta or options.qStartFromBlast or options.qStartFromGroups or options.qStartFromTrees ) : + # not a failure/exit, just a warning + print("ERROR: --normalize option is not possible with other options") + util.Fail() + + if options.qNormalizeAndFixZeros and ( options.qNormalize or options.qStartFromFasta or options.qStartFromBlast or options.qStartFromGroups or options.qStartFromTrees ) : # not a failure/exit, just a warning - print("ERROR: Normalize option is not possible with other options") + print("ERROR: --normalize-fix option is not possible with other options") util.Fail() util.PrintTime("Starting OrthoFinder") @@ -1294,7 +1308,7 @@ def DoOrthogroups(options, speciesInfoObj, seqsInfo): blastDir_list = files.FileHandler.GetBlastResultsDir() for iSpecies in range(seqsInfo.nSpecies): cmd_queue.put((seqsInfo, blastDir_list, Lengths, iSpecies)) - runningProcesses = [mp.Process(target=WaterfallMethod.Worker_ProcessBlastHits, args=(cmd_queue, options.qDoubleBlast, options.qNormalize)) for i_ in range(options.nProcessAlg)] + runningProcesses = [mp.Process(target=WaterfallMethod.Worker_ProcessBlastHits, args=(cmd_queue, options.qDoubleBlast, options.qNormalize, options.qNormalizeAndFixZeros)) for i_ in range(options.nProcessAlg)] for proc in runningProcesses: proc.start() parallel_task_manager.ManageQueue(runningProcesses, cmd_queue) @@ -1356,7 +1370,7 @@ def DoNormalize(options, speciesInfoObj, seqsInfo): for iSpecies in range(seqsInfo.nSpecies): cmd_queue.put((seqsInfo, blastDir_list, Lengths, iSpecies)) # Normalization is done here - runningProcesses = [mp.Process(target=WaterfallMethod.Worker_ProcessBlastHits, args=(cmd_queue, options.qDoubleBlast, options.qNormalize)) for i_ in range(options.nProcessAlg)] + runningProcesses = [mp.Process(target=WaterfallMethod.Worker_ProcessBlastHits, args=(cmd_queue, options.qDoubleBlast, options.qNormalize, options.qNormalizeAndFixZeros)) for i_ in range(options.nProcessAlg)] for proc in runningProcesses: proc.start() parallel_task_manager.ManageQueue(runningProcesses, cmd_queue) diff --git a/orthofinder/scripts/blast_file_processor.py b/orthofinder/scripts/blast_file_processor.py index bf20469a44677e878c6cc849ff478cd629d1bbc2..4be8c4c6b4096b0ae5badae1e6e18b791ef50f71 100644 --- a/orthofinder/scripts/blast_file_processor.py +++ b/orthofinder/scripts/blast_file_processor.py @@ -107,11 +107,9 @@ def WriteNormalizedBlastFile(blastDir_list, B, iSpecies, jSpecies, sep = "_", qD blastwriter = csv.writer(tempfile, delimiter='\t') for row in blastreader: rows.append(row) - newrows = sc.scnorm.NormalizedScores(iSpeciesOpen, jSpeciesOpen, iQ, iH, sep, B, rows) for the_row in newrows: blastwriter.writerow(the_row) - tempfile.flush() tempfile.close() move(tempfile.name, fnNorm) @@ -135,11 +133,9 @@ def WriteFixNormalizedBlastFile(blastDir_list, B, iSpecies, jSpecies, sep = "_", blastwriter = csv.writer(tempfile, delimiter='\t') for row in blastreader: rows.append(row) - newrows = sc.scnorm.FixZeroNormalizedScores(iSpeciesOpen, jSpeciesOpen, iQ, iH, sep, B, rows) for the_row in newrows: blastwriter.writerow(the_row) - tempfile.flush() tempfile.close() move(tempfile.name, fnNorm) diff --git a/orthofinder/scripts/scnorm.py b/orthofinder/scripts/scnorm.py index 91f67e497289d3a0cd1b0cf49ada07558679804d..ed4c385400f20c6886bca7e525256f44607d7a44 100644 --- a/orthofinder/scripts/scnorm.py +++ b/orthofinder/scripts/scnorm.py @@ -106,10 +106,11 @@ class scnorm: def DrawLogLinear(params, x): global counter global f - nbvals=10000 - maxBS=2500 + nbvals=70000 + maxBS=7000 minBS=0 x = np.linspace(minBS, maxBS, nbvals) + print("Drawing fitting curve") f.append(plt.plot(x, scnorm.loglinear(x, *params), label='species '+str(counter))) plt.legend() plt.xlabel('Bits Scores') @@ -151,17 +152,15 @@ class scnorm: """ This function retrieve the Normalization Score from Matrix B from (Seq iQ, Seq iH) - If it is 0.0, it will try to retrieve + If it is 0.0 (self-hits), it will try to retrieve a modified value from the log10 fitting curve """ NormBS = 0.0 newrows = [] # retrieve value from plot https://code-examples.net/en/q/8863d1 print(repr(iSpeciesOpen)) - print(repr(f[0][iSpeciesOpen])) - print(repr(f[0])) print(repr(f)) - all_plots = f[0][iSpeciesOpen].get_xydata() + all_plots = f[0][jSpeciesOpen].get_xydata() for row in rows: # Get hit and query IDs try: @@ -173,7 +172,7 @@ class scnorm: try: newscore = B[sequence1ID, sequence2ID] if newscore == 0.0: - sys.stderr.write("\nWARNING: 0.0 value detected in seq position %d %d for Blast species %d %d!\n" % (sequence1ID, sequence2ID, iSpeciesOpen, jSpeciesOpen)) + #sys.stderr.write("\nWARNING: 0.0 value detected in seq position %d %d for Blast species %d %d!\n" % (sequence1ID, sequence2ID, iSpeciesOpen, jSpeciesOpen)) try: x = float(row[11]) for index, values in all_plots: