1
2
3
4
5
6 """Approximate calculation of appropriate thresholds for motif finding
7 """
8
9
11 """ Class representing approximate score distribution for a given motif.
12
13 Utilizes a dynamic programming approch to calculate the distribution of
14 scores with a predefined precision. Provides a number of methods for calculating
15 thresholds for motif occurences.
16 """
17 - def __init__(self, motif=None, precision=10**3, pssm=None, background=None):
18 if pssm==None:
19 self.min_score = min(0.0, motif.min_score())
20 self.interval = max(0.0, motif.max_score())-self.min_score
21 self.n_points = precision * motif.length
22 self.ic=motif.ic()
23 else:
24 self.min_score = min(0.0, pssm.min)
25 self.interval = max(0.0, pssm.max)-self.min_score
26 self.n_points = precision * pssm.length
27 self.ic = pssm.mean(background)
28 self.step = self.interval/(self.n_points-1)
29 self.mo_density = [0.0]*self.n_points
30 self.mo_density[-self._index_diff(self.min_score)] = 1.0
31 self.bg_density = [0.0]*self.n_points
32 self.bg_density[-self._index_diff(self.min_score)] = 1.0
33 if pssm==None:
34 for lo,mo in zip(motif.log_odds(),motif.pwm()):
35 self.modify(lo,mo,motif.background)
36 else:
37 for position in range(pssm.length):
38 mo_new=[0.0]*self.n_points
39 bg_new=[0.0]*self.n_points
40 lo = pssm[:,position]
41 for letter, score in lo.iteritems():
42 bg = background[letter]
43 mo = pow(2,pssm[letter,position]) * bg
44 d=self._index_diff(score)
45 for i in range(self.n_points):
46 mo_new[self._add(i,d)]+=self.mo_density[i]*mo
47 bg_new[self._add(i,d)]+=self.bg_density[i]*bg
48 self.mo_density=mo_new
49 self.bg_density=bg_new
50
52 return int((x-y+0.5*self.step)//self.step)
53
55 return max(0,min(self.n_points-1,i+j))
56
57 - def modify(self,scores,mo_probs,bg_probs):
58 mo_new=[0.0]*self.n_points
59 bg_new=[0.0]*self.n_points
60 for k, v in scores.iteritems():
61 d=self._index_diff(v)
62 for i in range(self.n_points):
63 mo_new[self._add(i,d)]+=self.mo_density[i]*mo_probs[k]
64 bg_new[self._add(i,d)]+=self.bg_density[i]*bg_probs[k]
65 self.mo_density=mo_new
66 self.bg_density=bg_new
67
69 """
70 Approximate the log-odds threshold which makes the type I error (false positive rate).
71 """
72 i=self.n_points
73 prob=0.0
74 while prob<fpr:
75 i-=1
76 prob+=self.bg_density[i]
77 return self.min_score+i*self.step
78
80 """
81 Approximate the log-odds threshold which makes the type II error (false negative rate).
82 """
83 i=-1
84 prob=0.0
85 while prob<fnr:
86 i+=1
87 prob+=self.mo_density[i]
88 return self.min_score+i*self.step
89
91 """
92 Approximate the log-odds threshold which makes FNR equal to FPR times rate_proportion
93 """
94 i=self.n_points
95 fpr=0.0
96 fnr=1.0
97 while fpr*rate_proportion<fnr:
98 i-=1
99 fpr+=self.bg_density[i]
100 fnr-=self.mo_density[i]
101 if return_rate:
102 return self.min_score+i*self.step,fpr
103 else:
104 return self.min_score+i*self.step
105
107 """Threshold selection mimicking the behaviour of patser (Hertz, Stormo 1999) software.
108
109 It selects such a threshold that the log(fpr)=-ic(M)
110 note: the actual patser software uses natural logarithms instead of log_2, so the numbers
111 are not directly comparable.
112 """
113 return self.threshold_fpr(fpr=2**-self.ic)
114