From c96bb13c34d4a2f97c0c0333018db058b5a625cf Mon Sep 17 00:00:00 2001 From: remy <remy.dernat@umontpellier.fr> Date: Sun, 24 Nov 2019 15:42:59 +0100 Subject: [PATCH] update code to load a new class from file --- orthofinder/orthofinder.py | 95 +-------------- orthofinder/scripts/blast_file_processor.py | 2 + orthofinder/scripts/scnorm.py | 124 ++++++++++++++++++++ 3 files changed, 131 insertions(+), 90 deletions(-) create mode 100644 orthofinder/scripts/scnorm.py diff --git a/orthofinder/orthofinder.py b/orthofinder/orthofinder.py index e75d2be..228c719 100755 --- a/orthofinder/orthofinder.py +++ b/orthofinder/orthofinder.py @@ -54,10 +54,6 @@ except ImportError: import Queue as queue # Y import warnings # Y -import matplotlib as mpl # install -mpl.use('Agg') -import matplotlib.pyplot as plt # install - PY2 = sys.version_info <= (3,) csv_write_mode = 'wb' if PY2 else 'wt' @@ -66,6 +62,7 @@ from scripts import blast_file_processor from scripts import util, matrices, orthologues, trees_msa from scripts import program_caller from scripts import files +from scripts.scnorm import scnorm as sc from scripts import parallel_task_manager # Get directory containing script/bundle @@ -74,7 +71,6 @@ if getattr(sys, 'frozen', False): else: __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) -counter=0 max_int = sys.maxsize ok = False while not ok: @@ -296,87 +292,6 @@ class MCL: fileWriter_counts.writerow(row[:1] + counts_row + [sum(counts_row)]) thisOutputWriter.writerow(row) -""" -scnorm -------------------------------------------------------------------------------- -""" -class scnorm: - @staticmethod - def loglinear(x, a, b): - # return the loglinear function - return a*np.log10(x)+b - - @staticmethod - def GetLengthArraysForMatrix(m, len_i, len_j): - # return (I,j) positions non zeros values for the m matrix - I, J = m.nonzero() - scores = [v for row in m.data for v in row] # use fact that it's lil - Li = np.array(len_i[I]) - Lj = np.array(len_j[J]) - return Li, Lj, scores - - @staticmethod - def GetTopPercentileOfScores(L, S, percentileToKeep): - # Get the top x% of hits at each length - nScores = len(S) - t_sort = sorted(zip(L, range(nScores))) - indices = [j for i, j in t_sort] - s_sorted = [S[i] for i in indices] - l_sorted = [L[i] for i in indices] - if nScores < 100: - # then we can't split them into bins, return all for fitting - return l_sorted, s_sorted - nInBins = 1000 if nScores > 5000 else (200 if nScores > 1000 else 20) - nBins, remainder = divmod(nScores, nInBins) - topScores = [] - topLengths = [] - for i in range(nBins): - first = i*nInBins - last = min((i+1)*nInBins-1, nScores - 1) - theseLengths = l_sorted[first:last+1] - theseScores = s_sorted[first:last+1] - cutOff = np.percentile(theseScores, percentileToKeep) - lengthsToKeep = [thisL for thisL, thisScore in zip(theseLengths, theseScores) if thisScore >= cutOff] - topLengths.extend(lengthsToKeep) - topScores.extend([thisScore for thisL, thisScore in zip(theseLengths, theseScores) if thisScore >= cutOff]) - return topLengths, topScores - - @staticmethod - def CalculateFittingParameters(Lf, S): - # curve_fit doc.: https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.curve_fit.html - Slog = np.log10(S) - pars, covar = curve_fit(scnorm.loglinear, Lf, Slog) - scnorm.DrawLogLinear(pars, Lf, Slog) - return pars - - @staticmethod - def NormaliseScoresByLogLengthProduct(b, Lq, Lh, params): - rangeq = list(range(len(Lq))) - rangeh = list(range(len(Lh))) - li_vals = Lq**(-params[0]) - lj_vals = Lh**(-params[0]) - # l{i,j}_matrix are built using l{i,j}_vals data using (range{q,h},range{q,h}) positions - li_matrix = sparse.csr_matrix((li_vals, (rangeq, rangeq))) - lj_matrix = sparse.csr_matrix((lj_vals, (rangeh, rangeh))) - # lil_matrix named as Bij in ProcessBlastHits - return sparse.lil_matrix(10**(-params[1]) * li_matrix * b * lj_matrix) - - @staticmethod - def DrawLogLinear(params, x, y): - nbvals=5000 - maxBS=2500 - minBS=0 - x = np.linspace(minBS, maxBS, nbvals) - plt.plot(x, scnorm.loglinear(x, *params), label='species '+str(counter)) - plt.legend() - plt.title("Normalisation fitting curve for species") - plt.grid(True) - #plt.savefig('foo'+str(counter)+'.png') - plt.savefig('fitting_curve.png') - global counter - counter+=1 - - """ RunInfo @@ -509,12 +424,12 @@ def WriteGraph_perSpecies(args): class WaterfallMethod: @staticmethod def NormaliseScores(B, Lengths, iSpecies, jSpecies): - Li, Lj, scores = scnorm.GetLengthArraysForMatrix(B, Lengths[iSpecies], Lengths[jSpecies]) + Li, Lj, scores = sc.GetLengthArraysForMatrix(B, Lengths[iSpecies], Lengths[jSpecies]) Lf = Li * Lj - topLf, topScores = scnorm.GetTopPercentileOfScores(Lf, scores, 95) + topLf, topScores = sc.GetTopPercentileOfScores(Lf, scores, 95) if len(topScores) > 1: - fittingParameters = scnorm.CalculateFittingParameters(topLf, topScores) - return scnorm.NormaliseScoresByLogLengthProduct(B, Lengths[iSpecies], Lengths[jSpecies], fittingParameters) + fittingParameters = sc.CalculateFittingParameters(topLf, topScores) + return sc.NormaliseScoresByLogLengthProduct(B, Lengths[iSpecies], Lengths[jSpecies], fittingParameters) else: print("WARNING: Too few hits between species %d and species %d to normalise the scores, these hits will be ignored" % (iSpecies, jSpecies)) return sparse.lil_matrix(B.get_shape()) diff --git a/orthofinder/scripts/blast_file_processor.py b/orthofinder/scripts/blast_file_processor.py index 232b2f7..be32ead 100644 --- a/orthofinder/scripts/blast_file_processor.py +++ b/orthofinder/scripts/blast_file_processor.py @@ -140,6 +140,8 @@ def WriteNormalizedBlastFile(blastDir_list, B, iSpecies, jSpecies, sep = "_", qD raise try: newscore = B[sequence1ID, sequence2ID] + #if newscore == "0.0": + # FixZeroNormScore(int(row[12])) row.append(newscore) except IndexError as e: sys.stderr.write("\nERROR: Matrix index on Blast species %d %d, sequences B[%d, %d] : %s \n" % (iSpeciesOpen, jSpeciesOpen, sequence1ID, sequence2ID, e)) diff --git a/orthofinder/scripts/scnorm.py b/orthofinder/scripts/scnorm.py new file mode 100644 index 0000000..d28a4ef --- /dev/null +++ b/orthofinder/scripts/scnorm.py @@ -0,0 +1,124 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2014 David Emms +# +# This program (OrthoFinder) is distributed under the terms of the GNU General Public License v3 +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# When publishing work that uses OrthoFinder please cite: +# Emms, D.M. and Kelly, S. (2015) OrthoFinder: solving fundamental biases in whole genome comparisons dramatically +# improves orthogroup inference accuracy, Genome Biology 16:157 +# +# For any enquiries send an email to David Emms +# david_emms@hotmail.com + + +from scipy.optimize import curve_fit # install +import scipy.sparse as sparse # install +import numpy as np +import matplotlib as mpl # install +mpl.use('Agg') +import matplotlib.pyplot as plt # install + + +counter=0 + +""" +scnorm +------------------------------------------------------------------------------- +""" +class scnorm: + @staticmethod + def loglinear(x, a, b): + # return the loglinear function + return a*np.log10(x)+b + + @staticmethod + def GetLengthArraysForMatrix(m, len_i, len_j): + # return (I,j) positions non zeros values for the m matrix + I, J = m.nonzero() + scores = [v for row in m.data for v in row] # use fact that it's lil + Li = np.array(len_i[I]) + Lj = np.array(len_j[J]) + return Li, Lj, scores + + @staticmethod + def GetTopPercentileOfScores(L, S, percentileToKeep): + # Get the top x% of hits at each length + nScores = len(S) + t_sort = sorted(zip(L, range(nScores))) + indices = [j for i, j in t_sort] + s_sorted = [S[i] for i in indices] + l_sorted = [L[i] for i in indices] + if nScores < 100: + # then we can't split them into bins, return all for fitting + return l_sorted, s_sorted + nInBins = 1000 if nScores > 5000 else (200 if nScores > 1000 else 20) + nBins, remainder = divmod(nScores, nInBins) + topScores = [] + topLengths = [] + for i in range(nBins): + first = i*nInBins + last = min((i+1)*nInBins-1, nScores - 1) + theseLengths = l_sorted[first:last+1] + theseScores = s_sorted[first:last+1] + cutOff = np.percentile(theseScores, percentileToKeep) + lengthsToKeep = [thisL for thisL, thisScore in zip(theseLengths, theseScores) if thisScore >= cutOff] + topLengths.extend(lengthsToKeep) + topScores.extend([thisScore for thisL, thisScore in zip(theseLengths, theseScores) if thisScore >= cutOff]) + return topLengths, topScores + + @staticmethod + def CalculateFittingParameters(Lf, S): + # curve_fit doc.: https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.curve_fit.html + Slog = np.log10(S) + pars, covar = curve_fit(scnorm.loglinear, Lf, Slog) + scnorm.DrawLogLinear(pars, Lf, Slog) + return pars + + @staticmethod + def NormaliseScoresByLogLengthProduct(b, Lq, Lh, params): + rangeq = list(range(len(Lq))) + rangeh = list(range(len(Lh))) + li_vals = Lq**(-params[0]) + lj_vals = Lh**(-params[0]) + # l{i,j}_matrix are built using l{i,j}_vals data using (range{q,h},range{q,h}) positions + li_matrix = sparse.csr_matrix((li_vals, (rangeq, rangeq))) + lj_matrix = sparse.csr_matrix((lj_vals, (rangeh, rangeh))) + # lil_matrix named as Bij in ProcessBlastHits + return sparse.lil_matrix(10**(-params[1]) * li_matrix * b * lj_matrix) + + @staticmethod + def DrawLogLinear(params, x, y): + nbvals=5000 + maxBS=2500 + minBS=0 + x = np.linspace(minBS, maxBS, nbvals) + plt.plot(x, scnorm.loglinear(x, *params), label='species '+str(counter)) + plt.legend() + plt.xlabel('Normalized Scores') + plt.ylabel('Bits Scores') + plt.title("Normalisation fitting curve for species") + plt.grid(True) + #plt.savefig('foo'+str(counter)+'.png') + plt.savefig('fitting_curve.png') + global counter + counter+=1 + + @staticmethod + def FixZerosNormalizedBS(x): + Xlog=log10(x) + return Xlog + #return NBS -- GitLab