Package Bio :: Package motifs :: Module thresholds
[hide private]
[frames] | no frames]

Source Code for Module Bio.motifs.thresholds

  1  # Copyright 2008 by Norbert Dojer.  All rights reserved. 
  2  # Adapted by Bartek Wilczynski. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Approximate calculation of appropriate thresholds for motif finding 
  7  """ 
  8   
  9   
10 -class ScoreDistribution(object):
11 """ Class representing approximate score distribution for a given motif. 12 13 Utilizes a dynamic programming approach to calculate the distribution of 14 scores with a predefined precision. Provides a number of methods for calculating 15 thresholds for motif occurrences. 16 """
17 - def __init__(self, motif=None, precision=10 ** 3, pssm=None, background=None):
18 if pssm is None: 19 self.min_score = min(0.0, motif.min_score()) 20 self.interval = max(0.0, motif.max_score()) - self.min_score 21 self.n_points = precision * motif.length 22 self.ic = motif.ic() 23 else: 24 self.min_score = min(0.0, pssm.min) 25 self.interval = max(0.0, pssm.max) - self.min_score 26 self.n_points = precision * pssm.length 27 self.ic = pssm.mean(background) 28 self.step = self.interval / (self.n_points - 1) 29 self.mo_density = [0.0] * self.n_points 30 self.mo_density[-self._index_diff(self.min_score)] = 1.0 31 self.bg_density = [0.0] * self.n_points 32 self.bg_density[-self._index_diff(self.min_score)] = 1.0 33 if pssm is None: 34 for lo, mo in zip(motif.log_odds(), motif.pwm()): 35 self.modify(lo, mo, motif.background) 36 else: 37 for position in range(pssm.length): 38 mo_new = [0.0] * self.n_points 39 bg_new = [0.0] * self.n_points 40 lo = pssm[:, position] 41 for letter, score in lo.items(): 42 bg = background[letter] 43 mo = pow(2, pssm[letter, position]) * bg 44 d = self._index_diff(score) 45 for i in range(self.n_points): 46 mo_new[self._add(i, d)] += self.mo_density[i] * mo 47 bg_new[self._add(i, d)] += self.bg_density[i] * bg 48 self.mo_density = mo_new 49 self.bg_density = bg_new
50
51 - def _index_diff(self, x, y=0.0):
52 return int((x - y + 0.5 * self.step) // self.step)
53
54 - def _add(self, i, j):
55 return max(0, min(self.n_points - 1, i + j))
56
57 - def modify(self, scores, mo_probs, bg_probs):
58 mo_new = [0.0] * self.n_points 59 bg_new = [0.0] * self.n_points 60 for k, v in scores.items(): 61 d = self._index_diff(v) 62 for i in range(self.n_points): 63 mo_new[self._add(i, d)] += self.mo_density[i] * mo_probs[k] 64 bg_new[self._add(i, d)] += self.bg_density[i] * bg_probs[k] 65 self.mo_density = mo_new 66 self.bg_density = bg_new
67
68 - def threshold_fpr(self, fpr):
69 """ 70 Approximate the log-odds threshold which makes the type I error (false positive rate). 71 """ 72 i = self.n_points 73 prob = 0.0 74 while prob < fpr: 75 i -= 1 76 prob += self.bg_density[i] 77 return self.min_score + i * self.step
78
79 - def threshold_fnr(self, fnr):
80 """ 81 Approximate the log-odds threshold which makes the type II error (false negative rate). 82 """ 83 i = -1 84 prob = 0.0 85 while prob < fnr: 86 i += 1 87 prob += self.mo_density[i] 88 return self.min_score + i * self.step
89
90 - def threshold_balanced(self, rate_proportion=1.0, return_rate=False):
91 """ 92 Approximate the log-odds threshold which makes FNR equal to FPR times rate_proportion 93 """ 94 i = self.n_points 95 fpr = 0.0 96 fnr = 1.0 97 while fpr * rate_proportion < fnr: 98 i -= 1 99 fpr += self.bg_density[i] 100 fnr -= self.mo_density[i] 101 if return_rate: 102 return self.min_score + i * self.step, fpr 103 else: 104 return self.min_score + i * self.step
105
106 - def threshold_patser(self):
107 """Threshold selection mimicking the behaviour of patser (Hertz, Stormo 1999) software. 108 109 It selects such a threshold that the log(fpr)=-ic(M) 110 note: the actual patser software uses natural logarithms instead of log_2, so the numbers 111 are not directly comparable. 112 """ 113 return self.threshold_fpr(fpr=2 ** -self.ic)
114