1
2
3
4
5 """Implementation of frequency (count) matrices, position-weight matrices,
6 and position-specific scoring matrices.
7 """
8
9 import math
10 from Bio.Seq import Seq
11 from Bio.Alphabet import IUPAC
15
26
28 words = ["%6d" % i for i in range(self.length)]
29 line = " " + " ".join(words)
30 lines = [line]
31 for letter in self._letters:
32 words = ["%6.2f" % value for value in self[letter]]
33 line = "%c: " % letter + " ".join(words)
34 lines.append(line)
35 text = "\n".join(lines) + "\n"
36 return text
37
39 if isinstance(key, tuple):
40 if len(key)==2:
41 key1, key2 = key
42 if isinstance(key1, slice):
43 start1, stop1, stride1 = key1.indices(len(self._letters))
44 indices1 = range(start1, stop1, stride1)
45 letters1 = [self._letters[i] for i in indices1]
46 dim1 = 2
47 elif isinstance(key1, int):
48 letter1 = self._letters[key1]
49 dim1 = 1
50 elif isinstance(key1, tuple):
51 letters1 = [self._letters[i] for i in key1]
52 dim1 = 2
53 elif isinstance(key1, str):
54 if len(key1)==1:
55 letter1 = key1
56 dim1 = 1
57 else:
58 raise KeyError(key1)
59 else:
60 raise KeyError("Cannot understand key %s", str(key1))
61 if isinstance(key2, slice):
62 start2, stop2, stride2 = key2.indices(self.length)
63 indices2 = range(start2, stop2, stride2)
64 dim2 = 2
65 elif isinstance(key2, int):
66 index2 = key2
67 dim2 = 1
68 else:
69 raise KeyError("Cannot understand key %s", str(key2))
70 if dim1==1 and dim2==1:
71 return dict.__getitem__(self, letter1)[index2]
72 elif dim1==1 and dim2==2:
73 values = dict.__getitem__(self, letter1)
74 return tuple(values[index2] for index2 in indices2)
75 elif dim1==2 and dim2==1:
76 d = {}
77 for letter1 in letters1:
78 d[letter1] = dict.__getitem__(self, letter1)[index2]
79 return d
80 else:
81 d = {}
82 for letter1 in letters1:
83 values = dict.__getitem__(self, letter1)
84 d[letter1] = [values[index2] for index2 in indices2]
85 if sorted(letters1)==self._letters:
86 return self.__class__(self.alphabet, d)
87 else:
88 return d
89 elif len(key)==1:
90 key = key[0]
91 else:
92 raise KeyError("keys should be 1- or 2-dimensional")
93 if isinstance(key, slice):
94 start, stop, stride = key.indices(len(self._letters))
95 indices = range(start, stop, stride)
96 letters = [self._letters[i] for i in indices]
97 dim = 2
98 elif isinstance(key, int):
99 letter = self._letters[key]
100 dim = 1
101 elif isinstance(key, tuple):
102 letters = [self._letters[i] for i in key]
103 dim = 2
104 elif isinstance(key, str):
105 if len(key)==1:
106 letter = key
107 dim = 1
108 else:
109 raise KeyError(key)
110 else:
111 raise KeyError("Cannot understand key %s", str(key))
112 if dim==1:
113 return dict.__getitem__(self, letter)
114 elif dim==2:
115 d = {}
116 for letter in letters:
117 d[letter] = dict.__getitem__(self, letter)
118 return d
119 else:
120 raise RuntimeError("Should not get here")
121
122 @property
141
142 @property
159
160 @property
162
163
164
165
166
167 degenerate_nucleotide = {
168 'A': 'A',
169 'C': 'C',
170 'G': 'G',
171 'T': 'T',
172 'AC': 'M',
173 'AG': 'R',
174 'AT': 'W',
175 'CG': 'S',
176 'CT': 'Y',
177 'GT': 'K',
178 'ACG': 'V',
179 'ACT': 'H',
180 'AGT': 'D',
181 'CGT': 'B',
182 'ACGT': 'N',
183 }
184 sequence = ""
185 for i in range(self.length):
186 def get(nucleotide):
187 return self[nucleotide][i]
188 nucleotides = sorted(self, key=get, reverse=True)
189 counts = [self[c][i] for c in nucleotides]
190
191 if counts[0] >= sum(counts[1:]) and counts[0] >= 2*counts[1]:
192 key = nucleotides[0]
193 elif 4*sum(counts[:2]) > 3*sum(counts):
194 key = "".join(sorted(nucleotides[:2]))
195 elif counts[3]==0:
196 key = "".join(sorted(nucleotides[:3]))
197 else:
198 key = "ACGT"
199 nucleotide = degenerate_nucleotide[key]
200 sequence += nucleotide
201 return Seq(sequence, alphabet = IUPAC.ambiguous_dna)
202
211
214
216 """
217 create and return a position-weight matrix by normalizing the counts matrix.
218
219 If pseudocounts is None (default), no pseudocounts are added
220 to the counts.
221 If pseudocounts is a number, it is added to the counts before
222 calculating the position-weight matrix.
223 Alternatively, the pseudocounts can be a dictionary with a key
224 for each letter in the alphabet associated with the motif.
225 """
226
227 counts = {}
228 if pseudocounts is None:
229 for letter in self.alphabet.letters:
230 counts[letter] = [0.0] * self.length
231 elif isinstance(pseudocounts, dict):
232 for letter in self.alphabet.letters:
233 counts[letter] = [float(pseudocounts[letter])] * self.length
234 else:
235 for letter in self.alphabet.letters:
236 counts[letter] = [float(pseudocounts)] * self.length
237 for i in xrange(self.length):
238 for letter in self.alphabet.letters:
239 counts[letter][i] += self[letter][i]
240
241 return PositionWeightMatrix(self.alphabet, counts)
242
298
301
303 """
304 returns the PWM score for a given sequence for all positions.
305
306 - the sequence can only be a DNA sequence
307 - the search is performed only on one strand
308 - if the sequence and the motif have the same length, a single
309 number is returned
310 - otherwise, the result is a one-dimensional list or numpy array
311 """
312 if self.alphabet!=IUPAC.unambiguous_dna:
313 raise ValueError("Wrong alphabet! Use only with DNA motifs")
314 if sequence.alphabet!=IUPAC.unambiguous_dna:
315 raise ValueError("Wrong alphabet! Use only with DNA sequences")
316
317 sequence = str(sequence)
318 m = self.length
319 n = len(sequence)
320
321 scores = []
322
323 try:
324 import _pwm
325 except ImportError:
326
327 for i in xrange(n-m+1):
328 score = 0.0
329 for position in xrange(m):
330 letter = sequence[i+position]
331 score += self[letter][position]
332 scores.append(score)
333 else:
334
335
336 logodds = [[self[letter][i] for letter in "ACGT"] for i in range(m)]
337 scores = _pwm.calculate(sequence, logodds)
338 if len(scores)==1:
339 return scores[0]
340 else:
341 return scores
342
343 - def search(self, sequence, threshold=0.0, both=True):
361
362 @property
364 """Maximal possible score for this motif.
365
366 returns the score computed for the consensus sequence.
367 """
368 score = 0.0
369 letters = self._letters
370 for position in xrange(0,self.length):
371 score += max([self[letter][position] for letter in letters])
372 return score
373
374 @property
376 """Minimal possible score for this motif.
377
378 returns the score computed for the anticonsensus sequence.
379 """
380 score = 0.0
381 letters = self._letters
382 for position in xrange(0,self.length):
383 score += min([self[letter][position] for letter in letters])
384 return score
385
386 - def mean(self, background=None):
403
404 - def std(self, background=None):
405 """Standard deviation of the score of a motif."""
406 if background is None:
407 background = dict.fromkeys(self._letters, 1.0)
408 else:
409 background = dict(background)
410 total = sum(background.values())
411 for letter in self._letters:
412 background[letter] /= total
413 variance = 0.0
414 for i in range(self.length):
415 sx = 0.0
416 sxx = 0.0
417 for letter in self._letters:
418 logodds = self[letter,i]
419 b = background[letter]
420 p = b * math.pow(2,logodds)
421 sx += p*logodds
422 sxx += p*logodds*logodds
423 sxx -= sx*sx
424 variance += sxx
425 variance = max(variance, 0)
426 return math.sqrt(variance)
427
429 """
430 return the similarity score based on pearson correlation for the given motif against self.
431
432 We use the Pearson's correlation of the respective probabilities.
433 """
434 if self.alphabet != other.alphabet:
435 raise ValueError("Cannot compare motifs with different alphabets")
436
437 max_p=-2
438 for offset in range(-self.length+1, other.length):
439 if offset<0:
440 p = self.dist_pearson_at(other, -offset)
441 else:
442 p = other.dist_pearson_at(self, offset)
443 if max_p<p:
444 max_p=p
445 max_o=-offset
446 return 1-max_p,max_o
447
449 letters = self._letters
450 sx = 0.0
451 sy = 0.0
452 sxx = 0.0
453 sxy = 0.0
454 syy = 0.0
455 norm=max(self.length,offset+other.length)*len(letters)
456 for pos in range(min(self.length-offset, other.length)):
457 xi = [self[letter,pos+offset] for letter in letters]
458 yi = [other[letter,pos] for letter in letters]
459 sx += sum(xi)
460 sy += sum(yi)
461 sxx += sum([x*x for x in xi])
462 sxy += sum([x*y for x,y in zip(xi,yi)])
463 syy += sum([y*y for y in yi])
464 sx /= norm
465 sy /= norm
466 sxx /= norm
467 sxy /= norm
468 syy /= norm
469 numerator = sxy - sx*sy
470 denominator = math.sqrt((sxx-sx*sx)*(syy-sy*sy))
471 return numerator/denominator
472
484