Package Bio :: Package motifs :: Module matrix
[hide private]
[frames] | no frames]

Source Code for Module Bio.motifs.matrix

  1  # Copyright 2013 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Implementation of frequency (count) matrices, position-weight matrices, 
  6  and position-specific scoring matrices. 
  7  """ 
  8   
  9  import math 
 10   
 11  from Bio._py3k import range 
 12   
 13  from Bio.Seq import Seq 
 14  from Bio.Alphabet import IUPAC 
 15   
 16  # Hack for Python 2.5, isnan and isinf were new in Python 2.6 
 17  try: 
 18      from math import isnan as _isnan 
 19  except ImportError: 
20 - def _isnan(value):
21 # This is tricky due to cross platform float differences 22 if str(value).lower() == "nan": 23 return True 24 return value != value
25 try: 26 from math import isinf as _isinf 27 except ImportError:
28 - def _isinf(value):
29 # This is tricky due to cross platform float differences 30 if str(value).lower().endswith("inf"): 31 return True 32 return False
33 # Hack for Python 2.5 on Windows: 34 try: 35 _nan = float("nan") 36 except ValueError: 37 _nan = 1e1000 / 1e1000
38 39 40 -class GenericPositionMatrix(dict):
41
42 - def __init__(self, alphabet, values):
43 self.length = None 44 for letter in alphabet.letters: 45 if self.length is None: 46 self.length = len(values[letter]) 47 elif self.length!=len(values[letter]): 48 raise Exception("data has inconsistent lengths") 49 self[letter] = list(values[letter]) 50 self.alphabet = alphabet 51 self._letters = sorted(self.alphabet.letters)
52
53 - def __str__(self):
54 words = ["%6d" % i for i in range(self.length)] 55 line = " " + " ".join(words) 56 lines = [line] 57 for letter in self._letters: 58 words = ["%6.2f" % value for value in self[letter]] 59 line = "%c: " % letter + " ".join(words) 60 lines.append(line) 61 text = "\n".join(lines) + "\n" 62 return text
63
64 - def __getitem__(self, key):
65 if isinstance(key, tuple): 66 if len(key)==2: 67 key1, key2 = key 68 if isinstance(key1, slice): 69 start1, stop1, stride1 = key1.indices(len(self._letters)) 70 indices1 = range(start1, stop1, stride1) 71 letters1 = [self._letters[i] for i in indices1] 72 dim1 = 2 73 elif isinstance(key1, int): 74 letter1 = self._letters[key1] 75 dim1 = 1 76 elif isinstance(key1, tuple): 77 letters1 = [self._letters[i] for i in key1] 78 dim1 = 2 79 elif isinstance(key1, str): 80 if len(key1)==1: 81 letter1 = key1 82 dim1 = 1 83 else: 84 raise KeyError(key1) 85 else: 86 raise KeyError("Cannot understand key %s", str(key1)) 87 if isinstance(key2, slice): 88 start2, stop2, stride2 = key2.indices(self.length) 89 indices2 = range(start2, stop2, stride2) 90 dim2 = 2 91 elif isinstance(key2, int): 92 index2 = key2 93 dim2 = 1 94 else: 95 raise KeyError("Cannot understand key %s", str(key2)) 96 if dim1==1 and dim2==1: 97 return dict.__getitem__(self, letter1)[index2] 98 elif dim1==1 and dim2==2: 99 values = dict.__getitem__(self, letter1) 100 return tuple(values[index2] for index2 in indices2) 101 elif dim1==2 and dim2==1: 102 d = {} 103 for letter1 in letters1: 104 d[letter1] = dict.__getitem__(self, letter1)[index2] 105 return d 106 else: 107 d = {} 108 for letter1 in letters1: 109 values = dict.__getitem__(self, letter1) 110 d[letter1] = [values[index2] for index2 in indices2] 111 if sorted(letters1)==self._letters: 112 return self.__class__(self.alphabet, d) 113 else: 114 return d 115 elif len(key)==1: 116 key = key[0] 117 else: 118 raise KeyError("keys should be 1- or 2-dimensional") 119 if isinstance(key, slice): 120 start, stop, stride = key.indices(len(self._letters)) 121 indices = range(start, stop, stride) 122 letters = [self._letters[i] for i in indices] 123 dim = 2 124 elif isinstance(key, int): 125 letter = self._letters[key] 126 dim = 1 127 elif isinstance(key, tuple): 128 letters = [self._letters[i] for i in key] 129 dim = 2 130 elif isinstance(key, str): 131 if len(key)==1: 132 letter = key 133 dim = 1 134 else: 135 raise KeyError(key) 136 else: 137 raise KeyError("Cannot understand key %s", str(key)) 138 if dim==1: 139 return dict.__getitem__(self, letter) 140 elif dim==2: 141 d = {} 142 for letter in letters: 143 d[letter] = dict.__getitem__(self, letter) 144 return d 145 else: 146 raise RuntimeError("Should not get here")
147 148 @property
149 - def consensus(self):
150 """Returns the consensus sequence.""" 151 sequence = "" 152 for i in range(self.length): 153 try: 154 maximum = float("-inf") 155 except ValueError: 156 # On Python 2.5 or older that was handled in C code, 157 # and failed on Windows XP 32bit 158 maximum = - 1E400 159 for letter in self.alphabet.letters: 160 count = self[letter][i] 161 if count > maximum: 162 maximum = count 163 sequence_letter = letter 164 sequence += sequence_letter 165 return Seq(sequence, self.alphabet)
166 167 @property
168 - def anticonsensus(self):
169 sequence = "" 170 for i in range(self.length): 171 try: 172 minimum = float("inf") 173 except ValueError: 174 # On Python 2.5 or older that was handled in C code, 175 # and failed on Windows XP 32bit 176 minimum = 1E400 177 for letter in self.alphabet.letters: 178 count = self[letter][i] 179 if count < minimum: 180 minimum = count 181 sequence_letter = letter 182 sequence += sequence_letter 183 return Seq(sequence, self.alphabet)
184 185 @property
186 - def degenerate_consensus(self):
187 # Following the rules adapted from 188 # D. R. Cavener: "Comparison of the consensus sequence flanking 189 # translational start sites in Drosophila and vertebrates." 190 # Nucleic Acids Research 15(4): 1353-1361. (1987). 191 # The same rules are used by TRANSFAC. 192 degenerate_nucleotide = { 193 'A': 'A', 194 'C': 'C', 195 'G': 'G', 196 'T': 'T', 197 'AC': 'M', 198 'AG': 'R', 199 'AT': 'W', 200 'CG': 'S', 201 'CT': 'Y', 202 'GT': 'K', 203 'ACG': 'V', 204 'ACT': 'H', 205 'AGT': 'D', 206 'CGT': 'B', 207 'ACGT': 'N', 208 } 209 sequence = "" 210 for i in range(self.length): 211 def get(nucleotide): 212 return self[nucleotide][i]
213 nucleotides = sorted(self, key=get, reverse=True) 214 counts = [self[c][i] for c in nucleotides] 215 # Follow the Cavener rules: 216 if counts[0] >= sum(counts[1:]) and counts[0] >= 2*counts[1]: 217 key = nucleotides[0] 218 elif 4*sum(counts[:2]) > 3*sum(counts): 219 key = "".join(sorted(nucleotides[:2])) 220 elif counts[3]==0: 221 key = "".join(sorted(nucleotides[:3])) 222 else: 223 key = "ACGT" 224 nucleotide = degenerate_nucleotide[key] 225 sequence += nucleotide 226 return Seq(sequence, alphabet=IUPAC.ambiguous_dna)
227 228 @property
229 - def gc_content(self):
230 """Compute the fraction GC content.""" 231 alphabet = self.alphabet 232 gc_total = 0.0 233 total = 0.0 234 for i in range(self.length): 235 for letter in alphabet.letters: 236 if letter in 'CG': 237 gc_total += self[letter][i] 238 total += self[letter][i] 239 return gc_total / total
240
241 - def reverse_complement(self):
242 values = {} 243 values["A"] = self["T"][::-1] 244 values["T"] = self["A"][::-1] 245 values["G"] = self["C"][::-1] 246 values["C"] = self["G"][::-1] 247 alphabet = self.alphabet 248 return self.__class__(alphabet, values)
249
250 251 -class FrequencyPositionMatrix(GenericPositionMatrix):
252
253 - def normalize(self, pseudocounts=None):
254 """Create and return a position-weight matrix by normalizing the counts matrix. 255 256 If pseudocounts is None (default), no pseudocounts are added 257 to the counts. 258 259 If pseudocounts is a number, it is added to the counts before 260 calculating the position-weight matrix. 261 262 Alternatively, the pseudocounts can be a dictionary with a key 263 for each letter in the alphabet associated with the motif. 264 """ 265 counts = {} 266 if pseudocounts is None: 267 for letter in self.alphabet.letters: 268 counts[letter] = [0.0] * self.length 269 elif isinstance(pseudocounts, dict): 270 for letter in self.alphabet.letters: 271 counts[letter] = [float(pseudocounts[letter])] * self.length 272 else: 273 for letter in self.alphabet.letters: 274 counts[letter] = [float(pseudocounts)] * self.length 275 for i in range(self.length): 276 for letter in self.alphabet.letters: 277 counts[letter][i] += self[letter][i] 278 # Actual normalization is done in the PositionWeightMatrix initializer 279 return PositionWeightMatrix(self.alphabet, counts)
280
281 282 -class PositionWeightMatrix(GenericPositionMatrix):
283
284 - def __init__(self, alphabet, counts):
285 GenericPositionMatrix.__init__(self, alphabet, counts) 286 for i in range(self.length): 287 total = sum(float(self[letter][i]) for letter in alphabet.letters) 288 for letter in alphabet.letters: 289 self[letter][i] /= total 290 for letter in alphabet.letters: 291 self[letter] = tuple(self[letter])
292
293 - def log_odds(self, background=None):
294 """Returns the Position-Specific Scoring Matrix. 295 296 The Position-Specific Scoring Matrix (PSSM) contains the log-odds 297 scores computed from the probability matrix and the background 298 probabilities. If the background is None, a uniform background 299 distribution is assumed. 300 """ 301 values = {} 302 alphabet = self.alphabet 303 if background is None: 304 background = dict.fromkeys(self._letters, 1.0) 305 else: 306 background = dict(background) 307 total = sum(background.values()) 308 for letter in alphabet.letters: 309 background[letter] /= total 310 values[letter] = [] 311 for i in range(self.length): 312 for letter in alphabet.letters: 313 b = background[letter] 314 if b > 0: 315 p = self[letter][i] 316 if p > 0: 317 logodds = math.log(p/b, 2) 318 else: 319 # TODO - Ensure this has unittest coverage! 320 try: 321 logodds = float("-inf") 322 except ValueError: 323 # On Python 2.5 or older that was handled in C code, 324 # and failed on Windows XP 32bit 325 logodds = - 1E400 326 else: 327 p = self[letter][i] 328 if p > 0: 329 logodds = float("inf") 330 else: 331 logodds = _nan 332 values[letter].append(logodds) 333 pssm = PositionSpecificScoringMatrix(alphabet, values) 334 return pssm
335
336 337 -class PositionSpecificScoringMatrix(GenericPositionMatrix):
338
339 - def calculate(self, sequence):
340 """Returns the PWM score for a given sequence for all positions. 341 342 Notes: 343 344 - the sequence can only be a DNA sequence 345 - the search is performed only on one strand 346 - if the sequence and the motif have the same length, a single 347 number is returned 348 - otherwise, the result is a one-dimensional list or numpy array 349 """ 350 # TODO - Code itself tolerates ambiguous bases (as NaN). 351 if not isinstance(self.alphabet, IUPAC.IUPACUnambiguousDNA): 352 raise ValueError("PSSM has wrong alphabet: %s - Use only with DNA motifs" 353 % self.alphabet) 354 if not isinstance(sequence.alphabet, IUPAC.IUPACUnambiguousDNA): 355 raise ValueError("Sequence has wrong alphabet: %r - Use only with DNA sequences" 356 % sequence.alphabet) 357 358 # TODO - Force uppercase here and optimise switch statement in C 359 # by assuming upper case? 360 sequence = str(sequence) 361 m = self.length 362 n = len(sequence) 363 364 scores = [] 365 # check if the fast C code can be used 366 try: 367 import _pwm 368 except ImportError: 369 # use the slower Python code otherwise 370 # The C code handles mixed case so Python version must too: 371 sequence = sequence.upper() 372 for i in range(n-m+1): 373 score = 0.0 374 for position in range(m): 375 letter = sequence[i+position] 376 try: 377 score += self[letter][position] 378 except KeyError: 379 score = _nan 380 break 381 scores.append(score) 382 else: 383 # get the log-odds matrix into a proper shape 384 # (each row contains sorted (ACGT) log-odds values) 385 logodds = [[self[letter][i] for letter in "ACGT"] for i in range(m)] 386 scores = _pwm.calculate(sequence, logodds) 387 if len(scores)==1: 388 return scores[0] 389 else: 390 return scores
391
392 - def search(self, sequence, threshold=0.0, both=True):
393 """Find hits with PWM score above given threshold. 394 395 A generator function, returning found hits in the given sequence 396 with the pwm score higher than the threshold. 397 """ 398 sequence = sequence.upper() 399 n = len(sequence) 400 m = self.length 401 if both: 402 rc = self.reverse_complement() 403 for position in range(0, n-m+1): 404 s = sequence[position:position+m] 405 score = self.calculate(s) 406 if score > threshold: 407 yield (position, score) 408 if both: 409 score = rc.calculate(s) 410 if score > threshold: 411 yield (position-n, score)
412 413 @property
414 - def max(self):
415 """Maximal possible score for this motif. 416 417 returns the score computed for the consensus sequence. 418 """ 419 score = 0.0 420 letters = self._letters 421 for position in range(0, self.length): 422 score += max(self[letter][position] for letter in letters) 423 return score
424 425 @property
426 - def min(self):
427 """Minimal possible score for this motif. 428 429 returns the score computed for the anticonsensus sequence. 430 """ 431 score = 0.0 432 letters = self._letters 433 for position in range(0, self.length): 434 score += min(self[letter][position] for letter in letters) 435 return score
436 437 @property
438 - def gc_content(self):
439 raise Exception("Cannot compute the %GC composition of a PSSM")
440
441 - def mean(self, background=None):
442 """Expected value of the score of a motif.""" 443 if background is None: 444 background = dict.fromkeys(self._letters, 1.0) 445 else: 446 background = dict(background) 447 total = sum(background.values()) 448 for letter in self._letters: 449 background[letter] /= total 450 sx = 0.0 451 for i in range(self.length): 452 for letter in self._letters: 453 logodds = self[letter, i] 454 if _isnan(logodds): 455 continue 456 if _isinf(logodds) and logodds < 0: 457 continue 458 b = background[letter] 459 p = b * math.pow(2, logodds) 460 sx += p * logodds 461 return sx
462
463 - def std(self, background=None):
464 """Standard deviation of the score of a motif.""" 465 if background is None: 466 background = dict.fromkeys(self._letters, 1.0) 467 else: 468 background = dict(background) 469 total = sum(background.values()) 470 for letter in self._letters: 471 background[letter] /= total 472 variance = 0.0 473 for i in range(self.length): 474 sx = 0.0 475 sxx = 0.0 476 for letter in self._letters: 477 logodds = self[letter, i] 478 if _isnan(logodds): 479 continue 480 if _isinf(logodds) and logodds < 0: 481 continue 482 b = background[letter] 483 p = b * math.pow(2, logodds) 484 sx += p*logodds 485 sxx += p*logodds*logodds 486 sxx -= sx*sx 487 variance += sxx 488 variance = max(variance, 0) # to avoid roundoff problems 489 return math.sqrt(variance)
490
491 - def dist_pearson(self, other):
492 """Return the similarity score based on pearson correlation for the given motif against self. 493 494 We use the Pearson's correlation of the respective probabilities. 495 """ 496 if self.alphabet != other.alphabet: 497 raise ValueError("Cannot compare motifs with different alphabets") 498 499 max_p=-2 500 for offset in range(-self.length+1, other.length): 501 if offset<0: 502 p = self.dist_pearson_at(other, -offset) 503 else: # offset>=0 504 p = other.dist_pearson_at(self, offset) 505 if max_p<p: 506 max_p=p 507 max_o=-offset 508 return 1-max_p, max_o
509
510 - def dist_pearson_at(self, other, offset):
511 letters = self._letters 512 sx = 0.0 # \sum x 513 sy = 0.0 # \sum y 514 sxx = 0.0 # \sum x^2 515 sxy = 0.0 # \sum x \cdot y 516 syy = 0.0 # \sum y^2 517 norm=max(self.length, offset+other.length)*len(letters) 518 for pos in range(min(self.length-offset, other.length)): 519 xi = [self[letter, pos+offset] for letter in letters] 520 yi = [other[letter, pos] for letter in letters] 521 sx += sum(xi) 522 sy += sum(yi) 523 sxx += sum(x*x for x in xi) 524 sxy += sum(x*y for x, y in zip(xi, yi)) 525 syy += sum(y*y for y in yi) 526 sx /= norm 527 sy /= norm 528 sxx /= norm 529 sxy /= norm 530 syy /= norm 531 numerator = sxy - sx*sy 532 denominator = math.sqrt((sxx-sx*sx)*(syy-sy*sy)) 533 return numerator/denominator
534
535 - def distribution(self, background=None, precision=10**3):
536 """calculate the distribution of the scores at the given precision.""" 537 from .thresholds import ScoreDistribution 538 if background is None: 539 background = dict.fromkeys(self._letters, 1.0) 540 else: 541 background = dict(background) 542 total = sum(background.values()) 543 for letter in self._letters: 544 background[letter] /= total 545 return ScoreDistribution(precision=precision, pssm=self, background=background)
546