diff --git a/orthofinder/scripts/blast_file_processor.py b/orthofinder/scripts/blast_file_processor.py index 3627c29406a48349b488dfcb48ea0d96498568ae..0f7404e604296d91519c58e8e9ef045cf3415912 100644 --- a/orthofinder/scripts/blast_file_processor.py +++ b/orthofinder/scripts/blast_file_processor.py @@ -121,7 +121,9 @@ def WriteNormalizedBlastFile(blastDir_list, B, iSpecies, jSpecies, sep = "_", qD iSpeciesOpen = iSpecies jSpeciesOpen = jSpecies #xSeqID, ySeqID, NormVal = ListFromMatrix(B) + row = "" + rows = [] for d in blastDir_list: fn = d + "Blast%d_%d.txt" % (iSpeciesOpen, jSpeciesOpen) fnNorm = d + "Blast%d_%d.norm.blastout" % (iSpeciesOpen, jSpeciesOpen) @@ -132,24 +134,12 @@ def WriteNormalizedBlastFile(blastDir_list, B, iSpecies, jSpecies, sep = "_", qD tempfile = NamedTemporaryFile(mode='w', delete=False) blastwriter = csv.writer(tempfile, delimiter='\t') for row in blastreader: - # Get hit and query IDs - try: - sequence1ID = int(row[iQ].split(sep, 2)[1]) - sequence2ID = int(row[iH].split(sep, 2)[1]) - except (IndexError, ValueError): - sys.stderr.write("\nERROR: Query or hit sequence ID in BLAST results file was missing or incorrectly formatted.\n") - raise - try: - newscore = B[sequence1ID, sequence2ID] - if newscore == 0.0: - sys.stderr.write("\nERROR: 0.0 value detected in seq position %d %d for Blast species %d %d!\n" % (sequence1ID, sequence2ID, iSpeciesOpen, jSpeciesOpen)) + rows.append(row) + + newrows = sc.scnorm.FixZeroNormalizedScores(iSpeciesOpen, jSpeciesOpen, iQ, iH, sep, B, rows) + for the_row in newrows: + blastwriter.writerow(the_row) - # newscore = sc.scnorm.FixZeroNormalizedScores(iSpeciesOpen, float(row[11])) - row.append(newscore) - except IndexError as e: - sys.stderr.write("\nERROR: Matrix index on Blast species %d %d, sequences B[%d, %d] : %s \n" % (iSpeciesOpen, jSpeciesOpen, sequence1ID, sequence2ID, e)) - pass - blastwriter.writerow(row) tempfile.flush() tempfile.close() move(tempfile.name, fnNorm) diff --git a/orthofinder/scripts/scnorm.py b/orthofinder/scripts/scnorm.py index b6b041bc1056778366492c75e89345e7dddb74eb..4b482ab74b1ee9a220f84b8a5910f9cbc7517735 100644 --- a/orthofinder/scripts/scnorm.py +++ b/orthofinder/scripts/scnorm.py @@ -34,7 +34,6 @@ import matplotlib.pyplot as plt # install counter=0 -#curvefitplt=[] """ scnorm @@ -104,14 +103,13 @@ class scnorm: @staticmethod def DrawLogLinear(params, x): global counter - #global curvefitplt - nbvals=5000 + global f + nbvals=10000 maxBS=2500 minBS=0 x = np.linspace(minBS, maxBS, nbvals) plt.plot(x, scnorm.loglinear(x, *params), label='species '+str(counter)) - #plot = plt.plot(x, scnorm.loglinear(x, *params), label='species '+str(counter)) - #curvefitplt.append(plot) + f = plt.plot(x, scnorm.loglinear(x, *params), label='species '+str(counter)) plt.legend() plt.xlabel('Bits Scores') plt.ylabel('Normalized Scores') @@ -120,8 +118,42 @@ class scnorm: plt.savefig('fitting_curve.png') counter+=1 - #@staticmethod - #def FixZeroNormalizedScores(species, x): - # https://code-examples.net/en/q/8863d1 - #NormBS = curvefitplt[species][int(x)].get_data() - #return NormBS + @staticmethod + def FixZeroNormalizedScores(iSpeciesOpen, jSpeciesOpen, iQ, iH, sep, B, rows): + """ + This function retrieve the Normalization Score + from Matrix B from (Seq iQ, Seq iH) + If it is 0.0, it will try to retrieve + a modified value from the log10 fitting curve + """ + NormBS = 0.0 + newrows = [] + # retrieve value from plot https://code-examples.net/en/q/8863d1 + all_plots = f[iSpeciesOpen].get_xydata() + for row in rows: + # Get hit and query IDs + try: + sequence1ID = int(row[iQ].split(sep, 2)[1]) + sequence2ID = int(row[iH].split(sep, 2)[1]) + except (IndexError, ValueError): + sys.stderr.write("\nERROR: Query or hit sequence ID in BLAST results file was missing or incorrectly formatted.\n") + raise + try: + newscore = B[sequence1ID, sequence2ID] + if newscore == 0.0: + sys.stderr.write("\nWARNING: 0.0 value detected in seq position %d %d for Blast species %d %d!\n" % (sequence1ID, sequence2ID, iSpeciesOpen, jSpeciesOpen)) + try: + x = float(row[11]) + for index, values in all_plots: + if values == x: + NormBS = index + break + row.append(NormBS) + except (IndexError, ValueError): + sys.stderr.write("\nERROR: 12th field in BLAST results file line should be the bit-score for the hit\n") + row.append(newscore) + except IndexError as e: + sys.stderr.write("\nERROR: Matrix index on Blast species %d %d, sequences B[%d, %d] : %s \n" % (iSpeciesOpen, jSpeciesOpen, sequence1ID, sequence2ID, e)) + pass + newrows.append(row) + return newrows