From c96bb13c34d4a2f97c0c0333018db058b5a625cf Mon Sep 17 00:00:00 2001
From: remy <remy.dernat@umontpellier.fr>
Date: Sun, 24 Nov 2019 15:42:59 +0100
Subject: [PATCH] update code to load a new class from file

---
 orthofinder/orthofinder.py                  |  95 +--------------
 orthofinder/scripts/blast_file_processor.py |   2 +
 orthofinder/scripts/scnorm.py               | 124 ++++++++++++++++++++
 3 files changed, 131 insertions(+), 90 deletions(-)
 create mode 100644 orthofinder/scripts/scnorm.py

diff --git a/orthofinder/orthofinder.py b/orthofinder/orthofinder.py
index e75d2be..228c719 100755
--- a/orthofinder/orthofinder.py
+++ b/orthofinder/orthofinder.py
@@ -54,10 +54,6 @@ except ImportError:
     import Queue as queue                       # Y
 import warnings                                 # Y
 
-import matplotlib as mpl                        # install
-mpl.use('Agg')
-import matplotlib.pyplot as plt                 # install
-
 PY2 = sys.version_info <= (3,)
 csv_write_mode = 'wb' if PY2 else 'wt'
 
@@ -66,6 +62,7 @@ from scripts import blast_file_processor
 from scripts import util, matrices, orthologues, trees_msa
 from scripts import program_caller
 from scripts import files
+from scripts.scnorm import scnorm as sc
 from scripts import parallel_task_manager
 
 # Get directory containing script/bundle
@@ -74,7 +71,6 @@ if getattr(sys, 'frozen', False):
 else:
     __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
 
-counter=0
 max_int = sys.maxsize
 ok = False
 while not ok:
@@ -296,87 +292,6 @@ class MCL:
                     fileWriter_counts.writerow(row[:1] + counts_row + [sum(counts_row)])
                 thisOutputWriter.writerow(row)
 
-"""
-scnorm
--------------------------------------------------------------------------------
-"""
-class scnorm:
-    @staticmethod
-    def loglinear(x, a, b):
-        # return the loglinear function
-        return a*np.log10(x)+b
-    
-    @staticmethod
-    def GetLengthArraysForMatrix(m, len_i, len_j):
-        # return (I,j) positions non zeros values for the m matrix
-        I, J = m.nonzero()
-        scores = [v for row in m.data for v in row]     # use fact that it's lil
-        Li = np.array(len_i[I])
-        Lj = np.array(len_j[J])
-        return Li, Lj, scores
-        
-    @staticmethod
-    def GetTopPercentileOfScores(L, S, percentileToKeep):
-        # Get the top x% of hits at each length
-        nScores = len(S)
-        t_sort = sorted(zip(L, range(nScores)))
-        indices = [j for i, j in t_sort]
-        s_sorted = [S[i] for i in indices]
-        l_sorted = [L[i] for i in indices]
-        if nScores < 100:
-            # then we can't split them into bins, return all for fitting
-            return l_sorted, s_sorted
-        nInBins = 1000 if nScores > 5000 else (200 if nScores > 1000 else 20)
-        nBins, remainder = divmod(nScores, nInBins)
-        topScores = []
-        topLengths = []
-        for i in range(nBins):
-            first = i*nInBins
-            last = min((i+1)*nInBins-1, nScores - 1)
-            theseLengths = l_sorted[first:last+1]
-            theseScores = s_sorted[first:last+1]
-            cutOff = np.percentile(theseScores, percentileToKeep)
-            lengthsToKeep = [thisL for thisL, thisScore in zip(theseLengths, theseScores) if thisScore >= cutOff]
-            topLengths.extend(lengthsToKeep)
-            topScores.extend([thisScore for thisL, thisScore in zip(theseLengths, theseScores) if thisScore >= cutOff])
-        return topLengths, topScores
-        
-    @staticmethod
-    def CalculateFittingParameters(Lf, S):
-        # curve_fit doc.: https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.curve_fit.html
-        Slog = np.log10(S)
-        pars, covar =  curve_fit(scnorm.loglinear, Lf, Slog)
-        scnorm.DrawLogLinear(pars, Lf, Slog)
-        return pars
-           
-    @staticmethod
-    def NormaliseScoresByLogLengthProduct(b, Lq, Lh, params): 
-        rangeq = list(range(len(Lq)))
-        rangeh = list(range(len(Lh)))
-        li_vals = Lq**(-params[0])
-        lj_vals = Lh**(-params[0])
-        # l{i,j}_matrix are built using l{i,j}_vals data using (range{q,h},range{q,h}) positions
-        li_matrix = sparse.csr_matrix((li_vals, (rangeq, rangeq)))
-        lj_matrix = sparse.csr_matrix((lj_vals, (rangeh, rangeh)))
-        # lil_matrix named as Bij in ProcessBlastHits
-        return sparse.lil_matrix(10**(-params[1]) * li_matrix * b * lj_matrix)
-    
-    @staticmethod
-    def DrawLogLinear(params, x, y):
-        nbvals=5000
-        maxBS=2500
-        minBS=0
-        x = np.linspace(minBS, maxBS, nbvals)
-        plt.plot(x, scnorm.loglinear(x, *params), label='species '+str(counter))
-        plt.legend()
-        plt.title("Normalisation fitting curve for species")
-        plt.grid(True)
-        #plt.savefig('foo'+str(counter)+'.png')
-        plt.savefig('fitting_curve.png')
-        global counter
-        counter+=1
-
-        
 
 """
 RunInfo
@@ -509,12 +424,12 @@ def WriteGraph_perSpecies(args):
 class WaterfallMethod:    
     @staticmethod
     def NormaliseScores(B, Lengths, iSpecies, jSpecies):
-        Li, Lj, scores = scnorm.GetLengthArraysForMatrix(B, Lengths[iSpecies], Lengths[jSpecies])
+        Li, Lj, scores = sc.GetLengthArraysForMatrix(B, Lengths[iSpecies], Lengths[jSpecies])
         Lf = Li * Lj
-        topLf, topScores = scnorm.GetTopPercentileOfScores(Lf, scores, 95)
+        topLf, topScores = sc.GetTopPercentileOfScores(Lf, scores, 95)
         if len(topScores) > 1:
-            fittingParameters = scnorm.CalculateFittingParameters(topLf, topScores)
-            return scnorm.NormaliseScoresByLogLengthProduct(B, Lengths[iSpecies], Lengths[jSpecies], fittingParameters)
+            fittingParameters = sc.CalculateFittingParameters(topLf, topScores)
+            return sc.NormaliseScoresByLogLengthProduct(B, Lengths[iSpecies], Lengths[jSpecies], fittingParameters)
         else:
             print("WARNING: Too few hits between species %d and species %d to normalise the scores, these hits will be ignored" % (iSpecies, jSpecies))
             return sparse.lil_matrix(B.get_shape())
diff --git a/orthofinder/scripts/blast_file_processor.py b/orthofinder/scripts/blast_file_processor.py
index 232b2f7..be32ead 100644
--- a/orthofinder/scripts/blast_file_processor.py
+++ b/orthofinder/scripts/blast_file_processor.py
@@ -140,6 +140,8 @@ def WriteNormalizedBlastFile(blastDir_list, B, iSpecies, jSpecies, sep = "_", qD
                     raise
                 try:
                     newscore = B[sequence1ID, sequence2ID]
+                    #if newscore == "0.0":
+                    #    FixZeroNormScore(int(row[12]))
                     row.append(newscore)
                 except IndexError as e:
                     sys.stderr.write("\nERROR: Matrix index on Blast species %d %d, sequences B[%d, %d] : %s \n" % (iSpeciesOpen, jSpeciesOpen, sequence1ID, sequence2ID, e))
diff --git a/orthofinder/scripts/scnorm.py b/orthofinder/scripts/scnorm.py
new file mode 100644
index 0000000..d28a4ef
--- /dev/null
+++ b/orthofinder/scripts/scnorm.py
@@ -0,0 +1,124 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright 2014 David Emms
+#
+# This program (OrthoFinder) is distributed under the terms of the GNU General Public License v3
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#  
+#  When publishing work that uses OrthoFinder please cite:
+#      Emms, D.M. and Kelly, S. (2015) OrthoFinder: solving fundamental biases in whole genome comparisons dramatically 
+#      improves orthogroup inference accuracy, Genome Biology 16:157
+#
+# For any enquiries send an email to David Emms
+# david_emms@hotmail.com 
+
+
+from scipy.optimize import curve_fit            # install
+import scipy.sparse as sparse                   # install
+import numpy as np
+import matplotlib as mpl                        # install
+mpl.use('Agg')
+import matplotlib.pyplot as plt                 # install
+
+
+counter=0
+
+"""
+scnorm
+-------------------------------------------------------------------------------
+"""
+class scnorm:
+    @staticmethod
+    def loglinear(x, a, b):
+        # return the loglinear function
+        return a*np.log10(x)+b
+
+    @staticmethod
+    def GetLengthArraysForMatrix(m, len_i, len_j):
+        # return (I,j) positions non zeros values for the m matrix
+        I, J = m.nonzero()
+        scores = [v for row in m.data for v in row]     # use fact that it's lil
+        Li = np.array(len_i[I])
+        Lj = np.array(len_j[J])
+        return Li, Lj, scores
+
+    @staticmethod
+    def GetTopPercentileOfScores(L, S, percentileToKeep):
+        # Get the top x% of hits at each length
+        nScores = len(S)
+        t_sort = sorted(zip(L, range(nScores)))
+        indices = [j for i, j in t_sort]
+        s_sorted = [S[i] for i in indices]
+        l_sorted = [L[i] for i in indices]
+        if nScores < 100:
+            # then we can't split them into bins, return all for fitting
+            return l_sorted, s_sorted
+        nInBins = 1000 if nScores > 5000 else (200 if nScores > 1000 else 20)
+        nBins, remainder = divmod(nScores, nInBins)
+        topScores = []
+        topLengths = []
+        for i in range(nBins):
+            first = i*nInBins
+            last = min((i+1)*nInBins-1, nScores - 1)
+            theseLengths = l_sorted[first:last+1]
+            theseScores = s_sorted[first:last+1]
+            cutOff = np.percentile(theseScores, percentileToKeep)
+            lengthsToKeep = [thisL for thisL, thisScore in zip(theseLengths, theseScores) if thisScore >= cutOff]
+            topLengths.extend(lengthsToKeep)
+            topScores.extend([thisScore for thisL, thisScore in zip(theseLengths, theseScores) if thisScore >= cutOff])
+        return topLengths, topScores
+
+    @staticmethod
+    def CalculateFittingParameters(Lf, S):
+        # curve_fit doc.: https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.curve_fit.html
+        Slog = np.log10(S)
+        pars, covar =  curve_fit(scnorm.loglinear, Lf, Slog)
+        scnorm.DrawLogLinear(pars, Lf, Slog)
+        return pars
+
+    @staticmethod
+    def NormaliseScoresByLogLengthProduct(b, Lq, Lh, params):
+        rangeq = list(range(len(Lq)))
+        rangeh = list(range(len(Lh)))
+        li_vals = Lq**(-params[0])
+        lj_vals = Lh**(-params[0])
+        # l{i,j}_matrix are built using l{i,j}_vals data using (range{q,h},range{q,h}) positions
+        li_matrix = sparse.csr_matrix((li_vals, (rangeq, rangeq)))
+        lj_matrix = sparse.csr_matrix((lj_vals, (rangeh, rangeh)))
+        # lil_matrix named as Bij in ProcessBlastHits
+        return sparse.lil_matrix(10**(-params[1]) * li_matrix * b * lj_matrix)
+
+    @staticmethod
+    def DrawLogLinear(params, x, y):
+        nbvals=5000
+        maxBS=2500
+        minBS=0
+        x = np.linspace(minBS, maxBS, nbvals)
+        plt.plot(x, scnorm.loglinear(x, *params), label='species '+str(counter))
+        plt.legend()
+        plt.xlabel('Normalized Scores')
+        plt.ylabel('Bits Scores')
+        plt.title("Normalisation fitting curve for species")
+        plt.grid(True)
+        #plt.savefig('foo'+str(counter)+'.png')
+        plt.savefig('fitting_curve.png')
+        global counter
+        counter+=1
+
+    @staticmethod
+    def FixZerosNormalizedBS(x):
+        Xlog=log10(x)
+        return Xlog
+        #return NBS
-- 
GitLab