Package Bio :: Package motifs :: Module matrix
[hide private]
[frames] | no frames]

Source Code for Module Bio.motifs.matrix

  1  # Copyright 2013 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Implementation of frequency (count) matrices, position-weight matrices, 
  6  and position-specific scoring matrices. 
  7  """ 
  8   
  9  import math 
 10  from Bio.Seq import Seq 
 11  from Bio.Alphabet import IUPAC 
12 13 14 -class GenericPositionMatrix(dict):
15
16 - def __init__(self, alphabet, values):
17 self.length = None 18 for letter in alphabet.letters: 19 if self.length is None: 20 self.length = len(values[letter]) 21 elif self.length!=len(values[letter]): 22 raise Exception("data has inconsistent lengths") 23 self[letter] = list(values[letter]) 24 self.alphabet = alphabet 25 self._letters = sorted(self.alphabet.letters)
26
27 - def __str__(self):
28 words = ["%6d" % i for i in range(self.length)] 29 line = " " + " ".join(words) 30 lines = [line] 31 for letter in self._letters: 32 words = ["%6.2f" % value for value in self[letter]] 33 line = "%c: " % letter + " ".join(words) 34 lines.append(line) 35 text = "\n".join(lines) + "\n" 36 return text
37
38 - def __getitem__(self, key):
39 if isinstance(key, tuple): 40 if len(key)==2: 41 key1, key2 = key 42 if isinstance(key1, slice): 43 start1, stop1, stride1 = key1.indices(len(self._letters)) 44 indices1 = range(start1, stop1, stride1) 45 letters1 = [self._letters[i] for i in indices1] 46 dim1 = 2 47 elif isinstance(key1, int): 48 letter1 = self._letters[key1] 49 dim1 = 1 50 elif isinstance(key1, tuple): 51 letters1 = [self._letters[i] for i in key1] 52 dim1 = 2 53 elif isinstance(key1, str): 54 if len(key1)==1: 55 letter1 = key1 56 dim1 = 1 57 else: 58 raise KeyError(key1) 59 else: 60 raise KeyError("Cannot understand key %s", str(key1)) 61 if isinstance(key2, slice): 62 start2, stop2, stride2 = key2.indices(self.length) 63 indices2 = range(start2, stop2, stride2) 64 dim2 = 2 65 elif isinstance(key2, int): 66 index2 = key2 67 dim2 = 1 68 else: 69 raise KeyError("Cannot understand key %s", str(key2)) 70 if dim1==1 and dim2==1: 71 return dict.__getitem__(self, letter1)[index2] 72 elif dim1==1 and dim2==2: 73 values = dict.__getitem__(self, letter1) 74 return tuple(values[index2] for index2 in indices2) 75 elif dim1==2 and dim2==1: 76 d = {} 77 for letter1 in letters1: 78 d[letter1] = dict.__getitem__(self, letter1)[index2] 79 return d 80 else: 81 d = {} 82 for letter1 in letters1: 83 values = dict.__getitem__(self, letter1) 84 d[letter1] = [values[index2] for index2 in indices2] 85 if sorted(letters1)==self._letters: 86 return self.__class__(self.alphabet, d) 87 else: 88 return d 89 elif len(key)==1: 90 key = key[0] 91 else: 92 raise KeyError("keys should be 1- or 2-dimensional") 93 if isinstance(key, slice): 94 start, stop, stride = key.indices(len(self._letters)) 95 indices = range(start, stop, stride) 96 letters = [self._letters[i] for i in indices] 97 dim = 2 98 elif isinstance(key, int): 99 letter = self._letters[key] 100 dim = 1 101 elif isinstance(key, tuple): 102 letters = [self._letters[i] for i in key] 103 dim = 2 104 elif isinstance(key, str): 105 if len(key)==1: 106 letter = key 107 dim = 1 108 else: 109 raise KeyError(key) 110 else: 111 raise KeyError("Cannot understand key %s", str(key)) 112 if dim==1: 113 return dict.__getitem__(self, letter) 114 elif dim==2: 115 d = {} 116 for letter in letters: 117 d[letter] = dict.__getitem__(self, letter) 118 return d 119 else: 120 raise RuntimeError("Should not get here")
121 122 @property
123 - def consensus(self):
124 """Returns the consensus sequence. 125 """ 126 sequence = "" 127 for i in range(self.length): 128 try: 129 maximum = float("-inf") 130 except ValueError: 131 # On Python 2.5 or older that was handled in C code, 132 # and failed on Windows XP 32bit 133 maximum = - 1E400 134 for letter in self.alphabet.letters: 135 count = self[letter][i] 136 if count > maximum: 137 maximum = count 138 sequence_letter = letter 139 sequence += sequence_letter 140 return Seq(sequence, self.alphabet)
141 142 @property
143 - def anticonsensus(self):
144 sequence = "" 145 for i in range(self.length): 146 try: 147 minimum = float("inf") 148 except ValueError: 149 # On Python 2.5 or older that was handled in C code, 150 # and failed on Windows XP 32bit 151 minimum = 1E400 152 for letter in self.alphabet.letters: 153 count = self[letter][i] 154 if count < minimum: 155 minimum = count 156 sequence_letter = letter 157 sequence += sequence_letter 158 return Seq(sequence, self.alphabet)
159 160 @property
161 - def degenerate_consensus(self):
162 # Following the rules adapted from 163 # D. R. Cavener: "Comparison of the consensus sequence flanking 164 # translational start sites in Drosophila and vertebrates." 165 # Nucleic Acids Research 15(4): 1353-1361. (1987). 166 # The same rules are used by TRANSFAC. 167 degenerate_nucleotide = { 168 'A': 'A', 169 'C': 'C', 170 'G': 'G', 171 'T': 'T', 172 'AC': 'M', 173 'AG': 'R', 174 'AT': 'W', 175 'CG': 'S', 176 'CT': 'Y', 177 'GT': 'K', 178 'ACG': 'V', 179 'ACT': 'H', 180 'AGT': 'D', 181 'CGT': 'B', 182 'ACGT': 'N', 183 } 184 sequence = "" 185 for i in range(self.length): 186 def get(nucleotide): 187 return self[nucleotide][i]
188 nucleotides = sorted(self, key=get, reverse=True) 189 counts = [self[c][i] for c in nucleotides] 190 # Follow the Cavener rules: 191 if counts[0] >= sum(counts[1:]) and counts[0] >= 2*counts[1]: 192 key = nucleotides[0] 193 elif 4*sum(counts[:2]) > 3*sum(counts): 194 key = "".join(sorted(nucleotides[:2])) 195 elif counts[3]==0: 196 key = "".join(sorted(nucleotides[:3])) 197 else: 198 key = "ACGT" 199 nucleotide = degenerate_nucleotide[key] 200 sequence += nucleotide 201 return Seq(sequence, alphabet = IUPAC.ambiguous_dna)
202
203 - def reverse_complement(self):
204 values = {} 205 values["A"] = self["T"][::-1] 206 values["T"] = self["A"][::-1] 207 values["G"] = self["C"][::-1] 208 values["C"] = self["G"][::-1] 209 alphabet = self.alphabet 210 return self.__class__(alphabet, values)
211
212 213 -class FrequencyPositionMatrix(GenericPositionMatrix):
214
215 - def normalize(self, pseudocounts=None):
216 """ 217 create and return a position-weight matrix by normalizing the counts matrix. 218 219 If pseudocounts is None (default), no pseudocounts are added 220 to the counts. 221 If pseudocounts is a number, it is added to the counts before 222 calculating the position-weight matrix. 223 Alternatively, the pseudocounts can be a dictionary with a key 224 for each letter in the alphabet associated with the motif. 225 """ 226 227 counts = {} 228 if pseudocounts is None: 229 for letter in self.alphabet.letters: 230 counts[letter] = [0.0] * self.length 231 elif isinstance(pseudocounts, dict): 232 for letter in self.alphabet.letters: 233 counts[letter] = [float(pseudocounts[letter])] * self.length 234 else: 235 for letter in self.alphabet.letters: 236 counts[letter] = [float(pseudocounts)] * self.length 237 for i in xrange(self.length): 238 for letter in self.alphabet.letters: 239 counts[letter][i] += self[letter][i] 240 # Actual normalization is done in the PositionWeightMatrix initializer 241 return PositionWeightMatrix(self.alphabet, counts)
242
243 244 -class PositionWeightMatrix(GenericPositionMatrix):
245
246 - def __init__(self, alphabet, counts):
247 GenericPositionMatrix.__init__(self, alphabet, counts) 248 for i in xrange(self.length): 249 total = sum([float(self[letter][i]) for letter in alphabet.letters]) 250 for letter in alphabet.letters: 251 self[letter][i] /= total 252 for letter in alphabet.letters: 253 self[letter] = tuple(self[letter])
254
255 - def log_odds(self, background=None):
256 """ 257 returns the Position-Specific Scoring Matrix. 258 259 The Position-Specific Scoring Matrix (PSSM) contains the log-odds 260 scores computed from the probability matrix and the background 261 probabilities. If the background is None, a uniform background 262 distribution is assumed. 263 """ 264 values = {} 265 alphabet = self.alphabet 266 if background is None: 267 background = dict.fromkeys(self._letters, 1.0) 268 else: 269 background = dict(background) 270 total = sum(background.values()) 271 for letter in alphabet.letters: 272 background[letter] /= total 273 values[letter] = [] 274 for i in range(self.length): 275 for letter in alphabet.letters: 276 b = background[letter] 277 if b > 0: 278 p = self[letter][i] 279 if p > 0: 280 logodds = math.log(p/b, 2) 281 else: 282 #TODO - Ensure this has unittest coverage! 283 try: 284 logodds = float("-inf") 285 except ValueError: 286 # On Python 2.5 or older that was handled in C code, 287 # and failed on Windows XP 32bit 288 logodds = - 1E400 289 else: 290 p = self[letter][i] 291 if p > 0: 292 logodds = float("inf") 293 else: 294 logodds = float("nan") 295 values[letter].append(logodds) 296 pssm = PositionSpecificScoringMatrix(alphabet, values) 297 return pssm
298
299 300 -class PositionSpecificScoringMatrix(GenericPositionMatrix):
301
302 - def calculate(self, sequence):
303 """ 304 returns the PWM score for a given sequence for all positions. 305 306 - the sequence can only be a DNA sequence 307 - the search is performed only on one strand 308 - if the sequence and the motif have the same length, a single 309 number is returned 310 - otherwise, the result is a one-dimensional list or numpy array 311 """ 312 if self.alphabet!=IUPAC.unambiguous_dna: 313 raise ValueError("Wrong alphabet! Use only with DNA motifs") 314 if sequence.alphabet!=IUPAC.unambiguous_dna: 315 raise ValueError("Wrong alphabet! Use only with DNA sequences") 316 317 sequence = str(sequence) 318 m = self.length 319 n = len(sequence) 320 321 scores = [] 322 # check if the fast C code can be used 323 try: 324 import _pwm 325 except ImportError: 326 # use the slower Python code otherwise 327 for i in xrange(n-m+1): 328 score = 0.0 329 for position in xrange(m): 330 letter = sequence[i+position] 331 score += self[letter][position] 332 scores.append(score) 333 else: 334 # get the log-odds matrix into a proper shape 335 # (each row contains sorted (ACGT) log-odds values) 336 logodds = [[self[letter][i] for letter in "ACGT"] for i in range(m)] 337 scores = _pwm.calculate(sequence, logodds) 338 if len(scores)==1: 339 return scores[0] 340 else: 341 return scores
342
343 - def search(self, sequence, threshold=0.0, both=True):
344 """ 345 a generator function, returning found hits in a given sequence with the pwm score higher than the threshold 346 """ 347 sequence = sequence.upper() 348 n = len(sequence) 349 m = self.length 350 if both: 351 rc = self.reverse_complement() 352 for position in xrange(0,n-m+1): 353 s = sequence[position:position+m] 354 score = self.calculate(s) 355 if score > threshold: 356 yield (position, score) 357 if both: 358 score = rc.calculate(s) 359 if score > threshold: 360 yield (position-n, score)
361 362 @property
363 - def max(self):
364 """Maximal possible score for this motif. 365 366 returns the score computed for the consensus sequence. 367 """ 368 score = 0.0 369 letters = self._letters 370 for position in xrange(0,self.length): 371 score += max([self[letter][position] for letter in letters]) 372 return score
373 374 @property
375 - def min(self):
376 """Minimal possible score for this motif. 377 378 returns the score computed for the anticonsensus sequence. 379 """ 380 score = 0.0 381 letters = self._letters 382 for position in xrange(0,self.length): 383 score += min([self[letter][position] for letter in letters]) 384 return score
385
386 - def mean(self, background=None):
387 """Expected value of the score of a motif.""" 388 if background is None: 389 background = dict.fromkeys(self._letters, 1.0) 390 else: 391 background = dict(background) 392 total = sum(background.values()) 393 for letter in self._letters: 394 background[letter] /= total 395 sx = 0.0 396 for i in range(self.length): 397 for letter in self._letters: 398 logodds = self[letter,i] 399 b = background[letter] 400 p = b * math.pow(2,logodds) 401 sx += p * logodds 402 return sx
403
404 - def std(self, background=None):
405 """Standard deviation of the score of a motif.""" 406 if background is None: 407 background = dict.fromkeys(self._letters, 1.0) 408 else: 409 background = dict(background) 410 total = sum(background.values()) 411 for letter in self._letters: 412 background[letter] /= total 413 variance = 0.0 414 for i in range(self.length): 415 sx = 0.0 416 sxx = 0.0 417 for letter in self._letters: 418 logodds = self[letter,i] 419 b = background[letter] 420 p = b * math.pow(2,logodds) 421 sx += p*logodds 422 sxx += p*logodds*logodds 423 sxx -= sx*sx 424 variance += sxx 425 variance = max(variance, 0) # to avoid roundoff problems 426 return math.sqrt(variance)
427
428 - def dist_pearson(self, other):
429 """ 430 return the similarity score based on pearson correlation for the given motif against self. 431 432 We use the Pearson's correlation of the respective probabilities. 433 """ 434 if self.alphabet != other.alphabet: 435 raise ValueError("Cannot compare motifs with different alphabets") 436 437 max_p=-2 438 for offset in range(-self.length+1, other.length): 439 if offset<0: 440 p = self.dist_pearson_at(other, -offset) 441 else: # offset>=0 442 p = other.dist_pearson_at(self, offset) 443 if max_p<p: 444 max_p=p 445 max_o=-offset 446 return 1-max_p,max_o
447
448 - def dist_pearson_at(self, other, offset):
449 letters = self._letters 450 sx = 0.0 # \sum x 451 sy = 0.0 # \sum y 452 sxx = 0.0 # \sum x^2 453 sxy = 0.0 # \sum x \cdot y 454 syy = 0.0 # \sum y^2 455 norm=max(self.length,offset+other.length)*len(letters) 456 for pos in range(min(self.length-offset, other.length)): 457 xi = [self[letter,pos+offset] for letter in letters] 458 yi = [other[letter,pos] for letter in letters] 459 sx += sum(xi) 460 sy += sum(yi) 461 sxx += sum([x*x for x in xi]) 462 sxy += sum([x*y for x,y in zip(xi,yi)]) 463 syy += sum([y*y for y in yi]) 464 sx /= norm 465 sy /= norm 466 sxx /= norm 467 sxy /= norm 468 syy /= norm 469 numerator = sxy - sx*sy 470 denominator = math.sqrt((sxx-sx*sx)*(syy-sy*sy)) 471 return numerator/denominator
472
473 - def distribution(self, background=None, precision=10**3):
474 """calculate the distribution of the scores at the given precision.""" 475 from thresholds import ScoreDistribution 476 if background is None: 477 background = dict.fromkeys(self._letters, 1.0) 478 else: 479 background = dict(background) 480 total = sum(background.values()) 481 for letter in self._letters: 482 background[letter] /= total 483 return ScoreDistribution(precision=precision, pssm=self, background=background)
484