diff --git a/orthofinder/orthofinder.py b/orthofinder/orthofinder.py index d4e3ee0fdc321dbb7fc880317a4d2ae7d4744723..f04594ba00c66c46d31934975a8c903cb5cc5c4c 100755 --- a/orthofinder/orthofinder.py +++ b/orthofinder/orthofinder.py @@ -449,18 +449,18 @@ class WaterfallMethod: Bi.append(Bij) if qNormalizeAndFixZeros: for jSpecies in range(seqsInfo.nSpecies): - WaterfallMethod.WriteFixNewScores(seqsInfo, blastDir_list, Bi[jSpecies], seqsInfo.speciesToUse[iSpecies], seqsInfo.speciesToUse[jSpecies], qDoubleBlast=qDoubleBlast) + WaterfallMethod.WriteFixNewScores(seqsInfo, blastDir_list, seqsInfo.speciesToUse[iSpecies], seqsInfo.speciesToUse[jSpecies], qDoubleBlast=qDoubleBlast) matrices.DumpMatrixArray("B", Bi, iSpecies) BH = GetBH_s(Bi, seqsInfo, iSpecies) matrices.DumpMatrixArray("BH", BH, iSpecies) util.PrintTime("Initial processing of species %d complete" % iSpecies) @staticmethod - def Worker_ProcessBlastHits(cmd_queue, qDoubleBlast, qNormalize=False): + def Worker_ProcessBlastHits(cmd_queue, qDoubleBlast, qNormalize=False, qNormalizeAndFixZeros=False): while True: try: args = cmd_queue.get(True, 1) - WaterfallMethod.ProcessBlastHits(*args, qDoubleBlast=qDoubleBlast, qNormalize=qNormalize) + WaterfallMethod.ProcessBlastHits(*args, qDoubleBlast=qDoubleBlast, qNormalize=qNormalize, qNormalizeAndFixZeros=qNormalizeAndFixZeros) except queue.Empty: return @@ -472,11 +472,10 @@ class WaterfallMethod: blast_file_processor.WriteNormalizedBlastFile(blastDir_list, B, iSpecies, jSpecies, qDoubleBlast=qDoubleBlast) @staticmethod - def WriteFixNewScores(seqsInfo, blastDir_list, B, iSpecies, jSpecies, qDoubleBlast): + def WriteFixNewScores(seqsInfo, blastDir_list, iSpecies, jSpecies, qDoubleBlast): with warnings.catch_warnings(): warnings.simplefilter("ignore") - # process up to the best hits for each species; Bi is a list of matrices - blast_file_processor.WriteFixNormalizedBlastFile(blastDir_list, B, iSpecies, jSpecies, qDoubleBlast=qDoubleBlast) + blast_file_processor.WriteFixNormalizedBlastFile(blastDir_list, iSpecies, jSpecies, qDoubleBlast=qDoubleBlast) @staticmethod def ConnectCognates(seqsInfo, iSpecies): @@ -1149,7 +1148,7 @@ def ProcessArgs(prog_caller): # check argument combinations if not (options.qStartFromFasta or options.qStartFromBlast or options.qStartFromGroups or options.qStartFromTrees or options.qNormalize): - print("ERROR: Please specify the input directory for OrthoFinder using one of the options: '-f', '-b', '-N', '-fg' or '-ft'.") + print("ERROR: Please specify the input directory for OrthoFinder using one of the options: '-f', '-b', '-N', '-NF', '-fg' or '-ft'.") util.Fail() if options.qStartFromFasta and (options.qStartFromTrees or options.qStartFromGroups): @@ -1192,12 +1191,12 @@ def ProcessArgs(prog_caller): print("ERROR: Search program (%s) not configured in config.json file" % options.search_program) util.Fail() - if options.qNormalize and ( options.qNormalizeAndFixZeros or options.qStartFromFasta or options.qStartFromBlast or options.qStartFromGroups or options.qStartFromTrees ) : + if options.qNormalize and ( options.qStartFromFasta or options.qStartFromBlast or options.qStartFromGroups or options.qStartFromTrees ) : # not a failure/exit, just a warning print("ERROR: --normalize option is not possible with other options") util.Fail() - if options.qNormalizeAndFixZeros and ( options.qNormalize or options.qStartFromFasta or options.qStartFromBlast or options.qStartFromGroups or options.qStartFromTrees ) : + if options.qNormalizeAndFixZeros and ( options.qStartFromFasta or options.qStartFromBlast or options.qStartFromGroups or options.qStartFromTrees ) : # not a failure/exit, just a warning print("ERROR: --normalize-fix option is not possible with other options") util.Fail() diff --git a/orthofinder/scripts/blast_file_processor.py b/orthofinder/scripts/blast_file_processor.py index 4be8c4c6b4096b0ae5badae1e6e18b791ef50f71..ea41e9fb4b23b3a327fd93b90d862fe2a400e2d6 100644 --- a/orthofinder/scripts/blast_file_processor.py +++ b/orthofinder/scripts/blast_file_processor.py @@ -41,6 +41,24 @@ file_read_mode = 'rb' if PY2 else 'rt' #file_write_mode = 'wb+' if PY2 else 'wt+' file_write_mode = 'wb+' +def GetSpeciesOrder(iSpecies, jSpecies, qDoubleBlast): + # checking species order to get the right filenames + if not qDoubleBlast: + qRev = (iSpecies > jSpecies) + else: + qRev = False + if qRev: + iQ = 1 + iH = 0 + iSpeciesOpen = jSpecies + jSpeciesOpen = iSpecies + else: + iQ = 0 + iH = 1 + iSpeciesOpen = iSpecies + jSpeciesOpen = jSpecies + return iSpeciesOpen, jSpeciesOpen, iQ, iH + def GetBLAST6Scores(seqsInfo, blastDir_list, iSpecies, jSpecies, qExcludeSelfHits = True, sep = "_", qDoubleBlast=True): qSameSpecies = iSpecies==jSpecies qCheckForSelfHits = qExcludeSelfHits and qSameSpecies @@ -118,46 +136,31 @@ def WriteNormalizedBlastFile(blastDir_list, B, iSpecies, jSpecies, sep = "_", qD sys.stderr.write("\t".join(map(str, row)) + "\n") raise -def WriteFixNormalizedBlastFile(blastDir_list, B, iSpecies, jSpecies, sep = "_", qDoubleBlast=True): +def WriteFixNormalizedBlastFile(blastDir_list, iSpecies, jSpecies, sep = "_", qDoubleBlast=True): iSpeciesOpen, jSpeciesOpen, iQ, iH = GetSpeciesOrder(iSpecies, jSpecies, qDoubleBlast) row = "" rows = [] - for d in blastDir_list: - fn = d + "Blast%d_%d.txt" % (iSpeciesOpen, jSpeciesOpen) - fnNorm = d + "Blast%d_%d.norm.blastout" % (iSpeciesOpen, jSpeciesOpen) - if os.path.exists(fn) or os.path.exists(fn + ".gz"): break - try: - with (gzip.open(fn + ".gz", file_read_mode) if os.path.exists(fn + ".gz") else open(fn, file_read_mode)) as blastfile: - blastreader = csv.reader(blastfile, delimiter='\t') - tempfile = NamedTemporaryFile(mode='w', delete=False) - blastwriter = csv.writer(tempfile, delimiter='\t') - for row in blastreader: - rows.append(row) - newrows = sc.scnorm.FixZeroNormalizedScores(iSpeciesOpen, jSpeciesOpen, iQ, iH, sep, B, rows) - for the_row in newrows: - blastwriter.writerow(the_row) - tempfile.flush() - tempfile.close() - move(tempfile.name, fnNorm) - except Exception: - sys.stderr.write("Malformatted line in %sBlast%d_%d.txt\nOffending line was:\n" % (d, iSpecies, jSpecies)) - sys.stderr.write("\t".join(map(str, row)) + "\n") - raise + # 0.0 normalized value should only happen for self-hits + if iSpeciesOpen == jSpeciesOpen: + for d in blastDir_list: + fnNorm = d + "Blast%d_%d.norm.blastout" % (iSpeciesOpen, jSpeciesOpen) + fnNormFix = d + "Blast%d_%d.norm.fix.blastout" % (iSpeciesOpen, jSpeciesOpen) + if os.path.exists(fnNorm): break + try: + with open(fnNorm, file_read_mode) as blastfile: + blastreader = csv.reader(blastfile, delimiter='\t') + tempfile = NamedTemporaryFile(mode='w', delete=False) + blastwriter = csv.writer(tempfile, delimiter='\t') + for row in blastreader: + rows.append(row) + newrows = sc.scnorm.FixZeroNormalizedScores(iSpeciesOpen, jSpeciesOpen, rows) + for the_row in newrows: + blastwriter.writerow(the_row) + tempfile.flush() + tempfile.close() + move(tempfile.name, fnNormFix) + except Exception: + sys.stderr.write("Malformatted line in %sBlast%d_%d.txt\nOffending line was:\n" % (d, iSpecies, jSpecies)) + sys.stderr.write("\t".join(map(str, row)) + "\n") + raise -def GetSpeciesOrder(iSpecies, jSpecies, qDoubleBlast): - # checking species order to get the right filenames - if not qDoubleBlast: - qRev = (iSpecies > jSpecies) - else: - qRev = False - if qRev: - iQ = 1 - iH = 0 - iSpeciesOpen = jSpecies - jSpeciesOpen = iSpecies - else: - iQ = 0 - iH = 1 - iSpeciesOpen = iSpecies - jSpeciesOpen = jSpecies - return iSpeciesOpen, jSpeciesOpen, iQ, iH diff --git a/orthofinder/scripts/scnorm.py b/orthofinder/scripts/scnorm.py index ed4c385400f20c6886bca7e525256f44607d7a44..70bf908076f0bab11dabc21da91817ec73da2ba5 100644 --- a/orthofinder/scripts/scnorm.py +++ b/orthofinder/scripts/scnorm.py @@ -126,8 +126,8 @@ class scnorm: """ This function retrieve the Normalization Score from Matrix B from (Seq iQ, Seq iH) - If it is 0.0, it will try to retrieve - a modified value from the log10 fitting curve + and return the csv rows modified + by adding normalization score as a new column """ newrows = [] for row in rows: @@ -148,43 +148,35 @@ class scnorm: return newrows @staticmethod - def FixZeroNormalizedScores(iSpeciesOpen, jSpeciesOpen, iQ, iH, sep, B, rows): + def FixZeroNormalizedScores(iSpeciesOpen, jSpeciesOpen, rows): """ This function retrieve the Normalization Score - from Matrix B from (Seq iQ, Seq iH) + from current csv file (with Seq iQ, Seq iH) If it is 0.0 (self-hits), it will try to retrieve a modified value from the log10 fitting curve """ - NormBS = 0.0 + selfHitNormBlastScore = "0.0" + NormBS = selfHitNormBlastScore newrows = [] # retrieve value from plot https://code-examples.net/en/q/8863d1 - print(repr(iSpeciesOpen)) - print(repr(f)) - all_plots = f[0][jSpeciesOpen].get_xydata() + all_plots = f[jSpeciesOpen][0].get_xydata() + print(repr(all_plots)) + line = 0 for row in rows: - # Get hit and query IDs - try: - sequence1ID = int(row[iQ].split(sep, 2)[1]) - sequence2ID = int(row[iH].split(sep, 2)[1]) - except (IndexError, ValueError): - sys.stderr.write("\nERROR: Query or hit sequence ID in BLAST results file was missing or incorrectly formatted.\n") - raise try: - newscore = B[sequence1ID, sequence2ID] - if newscore == 0.0: - #sys.stderr.write("\nWARNING: 0.0 value detected in seq position %d %d for Blast species %d %d!\n" % (sequence1ID, sequence2ID, iSpeciesOpen, jSpeciesOpen)) + if row[12] == selfHitNormBlastScore: try: x = float(row[11]) for index, values in all_plots: if values == x: NormBS = index break - row.append(NormBS) + row[12] = NormBS except (IndexError, ValueError): sys.stderr.write("\nERROR: 12th field in BLAST results file line should be the bit-score for the hit\n") - row.append(newscore) except IndexError as e: - sys.stderr.write("\nERROR: Matrix index on Blast species %d %d, sequences B[%d, %d] : %s \n" % (iSpeciesOpen, jSpeciesOpen, sequence1ID, sequence2ID, e)) + sys.stderr.write("\nERROR: csv values on Blast species %d %d file, line %d : %s \n" % (iSpeciesOpen, jSpeciesOpen, line, e)) pass newrows.append(row) + line+=1 return newrows