diff --git a/orthofinder/orthofinder.py b/orthofinder/orthofinder.py index 46149b011fca000e56f722ee1042985fa5d9cad4..2edc6d4c07c80bdadbf3c231ac875f1253955848 100755 --- a/orthofinder/orthofinder.py +++ b/orthofinder/orthofinder.py @@ -479,7 +479,7 @@ def WriteGraph_perSpecies(args): class WaterfallMethod: @staticmethod - def NormaliseScores(B, Lengths, iSpecies, jSpecies): + def NormaliseScores(B, Lengths, iSpecies, jSpecies): Li, Lj, scores = scnorm.GetLengthArraysForMatrix(B, Lengths[iSpecies], Lengths[jSpecies]) Lf = Li * Lj topLf, topScores = scnorm.GetTopPercentileOfScores(Lf, scores, 95) @@ -491,7 +491,7 @@ class WaterfallMethod: return sparse.lil_matrix(B.get_shape()) @staticmethod - def ProcessBlastHits(seqsInfo, blastDir_list, Lengths, iSpecies, qDoubleBlast): + def ProcessBlastHits(seqsInfo, blastDir_list, Lengths, iSpecies, qDoubleBlast, qNormalize): with warnings.catch_warnings(): warnings.simplefilter("ignore") # process up to the best hits for each species; Bi is a list of matrices @@ -500,6 +500,8 @@ class WaterfallMethod: # getting Bit scores matrix for iSpecies on jSpecies Bij = blast_file_processor.GetBLAST6Scores(seqsInfo, blastDir_list, seqsInfo.speciesToUse[iSpecies], seqsInfo.speciesToUse[jSpecies], qDoubleBlast=qDoubleBlast) Bij = WaterfallMethod.NormaliseScores(Bij, Lengths, iSpecies, jSpecies) + if qNormalize: + WaterfallMethod.ProcessNewScores(seqsInfo, blastDir_list, Bij, iSpecies, qDoubleBlast=qDoubleBlast) Bi.append(Bij) matrices.DumpMatrixArray("B", Bi, iSpecies) BH = GetBH_s(Bi, seqsInfo, iSpecies) @@ -507,13 +509,21 @@ class WaterfallMethod: util.PrintTime("Initial processing of species %d complete" % iSpecies) @staticmethod - def Worker_ProcessBlastHits(cmd_queue, qDoubleBlast): + def Worker_ProcessBlastHits(cmd_queue, qDoubleBlast, qNormalize=False): while True: try: args = cmd_queue.get(True, 1) - WaterfallMethod.ProcessBlastHits(*args, qDoubleBlast=qDoubleBlast) + WaterfallMethod.ProcessBlastHits(*args, qDoubleBlast=qDoubleBlast, qNormalize=qNormalize) except queue.Empty: - return + return + + @staticmethod + def ProcessNewScores(seqsInfo, blastDir_list, B, iSpecies, qDoubleBlast): + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # process up to the best hits for each species; Bi is a list of matrices + for jSpecies in range(seqsInfo.nSpecies): + blast_file_processor.rewriteNormalizedBlastFile(blastDir_list, B, seqsInfo.speciesToUse[iSpecies], seqsInfo.speciesToUse[jSpecies], qDoubleBlast=qDoubleBlast) @staticmethod def ConnectCognates(seqsInfo, iSpecies): @@ -1330,7 +1340,7 @@ def DoOrthogroups(options, speciesInfoObj, seqsInfo): blastDir_list = files.FileHandler.GetBlastResultsDir() for iSpecies in range(seqsInfo.nSpecies): cmd_queue.put((seqsInfo, blastDir_list, Lengths, iSpecies)) - runningProcesses = [mp.Process(target=WaterfallMethod.Worker_ProcessBlastHits, args=(cmd_queue, options.qDoubleBlast)) for i_ in range(options.nProcessAlg)] + runningProcesses = [mp.Process(target=WaterfallMethod.Worker_ProcessBlastHits, args=(cmd_queue, options.qDoubleBlast, options.qNormalize)) for i_ in range(options.nProcessAlg)] for proc in runningProcesses: proc.start() parallel_task_manager.ManageQueue(runningProcesses, cmd_queue) @@ -1380,7 +1390,7 @@ def DoNormalize(options, speciesInfoObj, seqsInfo): """ DoNormalize is a lighweigth version of the DoOrthogroups function It will just normalize scores from a blast output - and write back blast results to *.norm.txt.gz + and write back blast results to *.norm.blastout """ # Run Algorithm, cluster and output cluster files with original accessions util.PrintUnderline("Running OrthoFinder algorithm") @@ -1392,7 +1402,7 @@ def DoNormalize(options, speciesInfoObj, seqsInfo): for iSpecies in range(seqsInfo.nSpecies): cmd_queue.put((seqsInfo, blastDir_list, Lengths, iSpecies)) # Normalization is done here - runningProcesses = [mp.Process(target=WaterfallMethod.Worker_ProcessBlastHits, args=(cmd_queue, options.qDoubleBlast)) for i_ in range(options.nProcessAlg)] + runningProcesses = [mp.Process(target=WaterfallMethod.Worker_ProcessBlastHits, args=(cmd_queue, options.qDoubleBlast, options.qNormalize=True)) for i_ in range(options.nProcessAlg)] for proc in runningProcesses: proc.start() parallel_task_manager.ManageQueue(runningProcesses, cmd_queue) diff --git a/orthofinder/scripts/blast_file_processor.py b/orthofinder/scripts/blast_file_processor.py index f64a441779b892c0b6a05bc2dfbb497bf7eb3e93..84e5de95c752f9971599793806c989f91db12c37 100644 --- a/orthofinder/scripts/blast_file_processor.py +++ b/orthofinder/scripts/blast_file_processor.py @@ -31,9 +31,14 @@ import gzip from scipy import sparse from . import util +from . import matrices +from tempfile import NamedTemporaryFile +from shutil import move PY2 = sys.version_info <= (3,) file_read_mode = 'rb' if PY2 else 'rt' +#file_write_mode = 'wb+' if PY2 else 'wt+' +file_write_mode = 'wb+' def GetBLAST6Scores(seqsInfo, blastDir_list, iSpecies, jSpecies, qExcludeSelfHits = True, sep = "_", qDoubleBlast=True): qSameSpecies = iSpecies==jSpecies @@ -47,7 +52,7 @@ def GetBLAST6Scores(seqsInfo, blastDir_list, iSpecies, jSpecies, qExcludeSelfHit iH = 0 iSpeciesOpen = jSpecies jSpeciesOpen = iSpecies - else: + else: iQ = 0 iH = 1 iSpeciesOpen = iSpecies @@ -96,4 +101,52 @@ def GetBLAST6Scores(seqsInfo, blastDir_list, iSpecies, jSpecies, qExcludeSelfHit sys.stderr.write("Malformatted line in %sBlast%d_%d.txt\nOffending line was:\n" % (d, iSpecies, jSpecies)) sys.stderr.write("\t".join(row) + "\n") raise - return B \ No newline at end of file + return B + +def rewriteNormalizedBlastFile(blastDir_list, B, iSpecies, jSpecies, qExcludeSelfHits = True, sep = "_", qDoubleBlast=True): + qSameSpecies = iSpecies==jSpecies + qCheckForSelfHits = qExcludeSelfHits and qSameSpecies + if not qDoubleBlast: + qRev = (iSpecies > jSpecies) + else: + qRev = False + if qRev: + iQ = 1 + iH = 0 + iSpeciesOpen = jSpecies + jSpeciesOpen = iSpecies + else: + iQ = 0 + iH = 1 + iSpeciesOpen = iSpecies + jSpeciesOpen = jSpecies + + tempfile = NamedTemporaryFile(mode='w', delete=False) + #xSeqID, ySeqID, NormVal = ParseMatrix(B) + for d in blastDir_list: + fn = d + "Blast%d_%d.txt" % (iSpeciesOpen, jSpeciesOpen) + fnNorm = d + "Blast%d_%d.norm.blastout" % (iSpeciesOpen, jSpeciesOpen) + if os.path.exists(fn) or os.path.exists(fn + ".gz"): break + try: + with (gzip.open(fn + ".gz", file_reader_mode) if os.path.exists(fn + ".gz") else open(fn, file_reader_mode)) as blastfile: + blastreader = csv.reader(blastfile, delimiter='\t') + blastwriter = csv.writer(tempfile, delimiter='\t') + for row in blastreader: + # Get hit and query IDs + try: + sequence1ID = int(row[iQ].split(sep, 2)[1]) + sequence2ID = int(row[iH].split(sep, 2)[1]) + except (IndexError, ValueError): + sys.stderr.write("\nERROR: Query or hit sequence ID in BLAST results file was missing or incorrectly formatted.\n") + raise + try: + score = float(row[11]) + except (IndexError, ValueError): + sys.stderr.write("\nERROR: 12th field in BLAST results file line should be the bit-score for the hit\n") + raise + newscore = B[sequence1ID, sequence2ID] + newrow = row + newrow[11] = newscore + writer.writerow(row) + + shutil.move(tempfile.name, fnNorm) \ No newline at end of file diff --git a/orthofinder/scripts/matrices.py b/orthofinder/scripts/matrices.py index e7bec1b8cce56ced7a4e8f2d9a4cdb3a354717db..11c0bb7b82fccf750ca4fc43b154f22de34f1015 100644 --- a/orthofinder/scripts/matrices.py +++ b/orthofinder/scripts/matrices.py @@ -75,4 +75,19 @@ def DeleteMatrices(baseName): def sparse_max_row(csr_mat): ret = np.zeros(csr_mat.shape[0]) ret[np.diff(csr_mat.indptr) != 0] = np.maximum.reduceat(csr_mat.data,csr_mat.indptr[:-1][np.diff(csr_mat.indptr)>0]) - return ret \ No newline at end of file + return ret + +def ParseMatrix(mat): + bdok = mat.todok() + SortedMat = sorted(bdok.items()) + NormVal=[] + xSeqID=[] + ySeqID=[] + for i in range(bdok.size-1): + xSeqID.append(SortedMat[i][0][0]) + ySeqID.append(SortedMat[i][0][1]) + # normalized bit scores for species ij + NormVal.append(SortedMat[i][1]) + #for i in range(len(Nval)): + # print("iSpeciesID_%i\tjSpeciesID_%i ... %f" % (xSeqID[i], ySeqID[i], NormVal[i])) + return xSeqID, ySeqID, NormVal \ No newline at end of file