Package Bio :: Package motifs :: Module matrix
[hide private]
[frames] | no frames]

Source Code for Module Bio.motifs.matrix

  1  # Copyright 2013 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Implementation of frequency (count) matrices, position-weight matrices, 
  6  and position-specific scoring matrices. 
  7  """ 
  8   
  9  import math 
 10   
 11  from Bio._py3k import range 
 12   
 13  from Bio.Seq import Seq 
 14  from Bio.Alphabet import IUPAC 
 15   
 16  #Hack for Python 2.5, isnan and isinf were new in Python 2.6 
 17  try: 
 18      from math import isnan as _isnan 
 19  except ImportError: 
20 - def _isnan(value):
21 #This is tricky due to cross platform float differences 22 if str(value).lower() == "nan": 23 return True 24 return value != value
25 try: 26 from math import isinf as _isinf 27 except ImportError:
28 - def _isinf(value):
29 #This is tricky due to cross platform float differences 30 if str(value).lower().endswith("inf"): 31 return True 32 return False
33 #Hack for Python 2.5 on Windows: 34 try: 35 _nan = float("nan") 36 except ValueError: 37 _nan = 1e1000 / 1e1000
38 39 40 -class GenericPositionMatrix(dict):
41
42 - def __init__(self, alphabet, values):
43 self.length = None 44 for letter in alphabet.letters: 45 if self.length is None: 46 self.length = len(values[letter]) 47 elif self.length!=len(values[letter]): 48 raise Exception("data has inconsistent lengths") 49 self[letter] = list(values[letter]) 50 self.alphabet = alphabet 51 self._letters = sorted(self.alphabet.letters)
52
53 - def __str__(self):
54 words = ["%6d" % i for i in range(self.length)] 55 line = " " + " ".join(words) 56 lines = [line] 57 for letter in self._letters: 58 words = ["%6.2f" % value for value in self[letter]] 59 line = "%c: " % letter + " ".join(words) 60 lines.append(line) 61 text = "\n".join(lines) + "\n" 62 return text
63
64 - def __getitem__(self, key):
65 if isinstance(key, tuple): 66 if len(key)==2: 67 key1, key2 = key 68 if isinstance(key1, slice): 69 start1, stop1, stride1 = key1.indices(len(self._letters)) 70 indices1 = range(start1, stop1, stride1) 71 letters1 = [self._letters[i] for i in indices1] 72 dim1 = 2 73 elif isinstance(key1, int): 74 letter1 = self._letters[key1] 75 dim1 = 1 76 elif isinstance(key1, tuple): 77 letters1 = [self._letters[i] for i in key1] 78 dim1 = 2 79 elif isinstance(key1, str): 80 if len(key1)==1: 81 letter1 = key1 82 dim1 = 1 83 else: 84 raise KeyError(key1) 85 else: 86 raise KeyError("Cannot understand key %s", str(key1)) 87 if isinstance(key2, slice): 88 start2, stop2, stride2 = key2.indices(self.length) 89 indices2 = range(start2, stop2, stride2) 90 dim2 = 2 91 elif isinstance(key2, int): 92 index2 = key2 93 dim2 = 1 94 else: 95 raise KeyError("Cannot understand key %s", str(key2)) 96 if dim1==1 and dim2==1: 97 return dict.__getitem__(self, letter1)[index2] 98 elif dim1==1 and dim2==2: 99 values = dict.__getitem__(self, letter1) 100 return tuple(values[index2] for index2 in indices2) 101 elif dim1==2 and dim2==1: 102 d = {} 103 for letter1 in letters1: 104 d[letter1] = dict.__getitem__(self, letter1)[index2] 105 return d 106 else: 107 d = {} 108 for letter1 in letters1: 109 values = dict.__getitem__(self, letter1) 110 d[letter1] = [values[index2] for index2 in indices2] 111 if sorted(letters1)==self._letters: 112 return self.__class__(self.alphabet, d) 113 else: 114 return d 115 elif len(key)==1: 116 key = key[0] 117 else: 118 raise KeyError("keys should be 1- or 2-dimensional") 119 if isinstance(key, slice): 120 start, stop, stride = key.indices(len(self._letters)) 121 indices = range(start, stop, stride) 122 letters = [self._letters[i] for i in indices] 123 dim = 2 124 elif isinstance(key, int): 125 letter = self._letters[key] 126 dim = 1 127 elif isinstance(key, tuple): 128 letters = [self._letters[i] for i in key] 129 dim = 2 130 elif isinstance(key, str): 131 if len(key)==1: 132 letter = key 133 dim = 1 134 else: 135 raise KeyError(key) 136 else: 137 raise KeyError("Cannot understand key %s", str(key)) 138 if dim==1: 139 return dict.__getitem__(self, letter) 140 elif dim==2: 141 d = {} 142 for letter in letters: 143 d[letter] = dict.__getitem__(self, letter) 144 return d 145 else: 146 raise RuntimeError("Should not get here")
147 148 @property
149 - def consensus(self):
150 """Returns the consensus sequence. 151 """ 152 sequence = "" 153 for i in range(self.length): 154 try: 155 maximum = float("-inf") 156 except ValueError: 157 # On Python 2.5 or older that was handled in C code, 158 # and failed on Windows XP 32bit 159 maximum = - 1E400 160 for letter in self.alphabet.letters: 161 count = self[letter][i] 162 if count > maximum: 163 maximum = count 164 sequence_letter = letter 165 sequence += sequence_letter 166 return Seq(sequence, self.alphabet)
167 168 @property
169 - def anticonsensus(self):
170 sequence = "" 171 for i in range(self.length): 172 try: 173 minimum = float("inf") 174 except ValueError: 175 # On Python 2.5 or older that was handled in C code, 176 # and failed on Windows XP 32bit 177 minimum = 1E400 178 for letter in self.alphabet.letters: 179 count = self[letter][i] 180 if count < minimum: 181 minimum = count 182 sequence_letter = letter 183 sequence += sequence_letter 184 return Seq(sequence, self.alphabet)
185 186 @property
187 - def degenerate_consensus(self):
188 # Following the rules adapted from 189 # D. R. Cavener: "Comparison of the consensus sequence flanking 190 # translational start sites in Drosophila and vertebrates." 191 # Nucleic Acids Research 15(4): 1353-1361. (1987). 192 # The same rules are used by TRANSFAC. 193 degenerate_nucleotide = { 194 'A': 'A', 195 'C': 'C', 196 'G': 'G', 197 'T': 'T', 198 'AC': 'M', 199 'AG': 'R', 200 'AT': 'W', 201 'CG': 'S', 202 'CT': 'Y', 203 'GT': 'K', 204 'ACG': 'V', 205 'ACT': 'H', 206 'AGT': 'D', 207 'CGT': 'B', 208 'ACGT': 'N', 209 } 210 sequence = "" 211 for i in range(self.length): 212 def get(nucleotide): 213 return self[nucleotide][i]
214 nucleotides = sorted(self, key=get, reverse=True) 215 counts = [self[c][i] for c in nucleotides] 216 # Follow the Cavener rules: 217 if counts[0] >= sum(counts[1:]) and counts[0] >= 2*counts[1]: 218 key = nucleotides[0] 219 elif 4*sum(counts[:2]) > 3*sum(counts): 220 key = "".join(sorted(nucleotides[:2])) 221 elif counts[3]==0: 222 key = "".join(sorted(nucleotides[:3])) 223 else: 224 key = "ACGT" 225 nucleotide = degenerate_nucleotide[key] 226 sequence += nucleotide 227 return Seq(sequence, alphabet = IUPAC.ambiguous_dna)
228 229 @property
230 - def gc_content(self):
231 """ 232 Compute the fraction GC content. 233 """ 234 alphabet = self.alphabet 235 gc_total = 0.0 236 total = 0.0 237 for i in range(self.length): 238 for letter in alphabet.letters: 239 if letter in 'CG': 240 gc_total += self[letter][i] 241 total += self[letter][i] 242 return gc_total / total
243
244 - def reverse_complement(self):
245 values = {} 246 values["A"] = self["T"][::-1] 247 values["T"] = self["A"][::-1] 248 values["G"] = self["C"][::-1] 249 values["C"] = self["G"][::-1] 250 alphabet = self.alphabet 251 return self.__class__(alphabet, values)
252
253 254 -class FrequencyPositionMatrix(GenericPositionMatrix):
255
256 - def normalize(self, pseudocounts=None):
257 """ 258 create and return a position-weight matrix by normalizing the counts matrix. 259 260 If pseudocounts is None (default), no pseudocounts are added 261 to the counts. 262 If pseudocounts is a number, it is added to the counts before 263 calculating the position-weight matrix. 264 Alternatively, the pseudocounts can be a dictionary with a key 265 for each letter in the alphabet associated with the motif. 266 """ 267 268 counts = {} 269 if pseudocounts is None: 270 for letter in self.alphabet.letters: 271 counts[letter] = [0.0] * self.length 272 elif isinstance(pseudocounts, dict): 273 for letter in self.alphabet.letters: 274 counts[letter] = [float(pseudocounts[letter])] * self.length 275 else: 276 for letter in self.alphabet.letters: 277 counts[letter] = [float(pseudocounts)] * self.length 278 for i in range(self.length): 279 for letter in self.alphabet.letters: 280 counts[letter][i] += self[letter][i] 281 # Actual normalization is done in the PositionWeightMatrix initializer 282 return PositionWeightMatrix(self.alphabet, counts)
283
284 285 -class PositionWeightMatrix(GenericPositionMatrix):
286
287 - def __init__(self, alphabet, counts):
288 GenericPositionMatrix.__init__(self, alphabet, counts) 289 for i in range(self.length): 290 total = sum(float(self[letter][i]) for letter in alphabet.letters) 291 for letter in alphabet.letters: 292 self[letter][i] /= total 293 for letter in alphabet.letters: 294 self[letter] = tuple(self[letter])
295
296 - def log_odds(self, background=None):
297 """ 298 returns the Position-Specific Scoring Matrix. 299 300 The Position-Specific Scoring Matrix (PSSM) contains the log-odds 301 scores computed from the probability matrix and the background 302 probabilities. If the background is None, a uniform background 303 distribution is assumed. 304 """ 305 values = {} 306 alphabet = self.alphabet 307 if background is None: 308 background = dict.fromkeys(self._letters, 1.0) 309 else: 310 background = dict(background) 311 total = sum(background.values()) 312 for letter in alphabet.letters: 313 background[letter] /= total 314 values[letter] = [] 315 for i in range(self.length): 316 for letter in alphabet.letters: 317 b = background[letter] 318 if b > 0: 319 p = self[letter][i] 320 if p > 0: 321 logodds = math.log(p/b, 2) 322 else: 323 #TODO - Ensure this has unittest coverage! 324 try: 325 logodds = float("-inf") 326 except ValueError: 327 # On Python 2.5 or older that was handled in C code, 328 # and failed on Windows XP 32bit 329 logodds = - 1E400 330 else: 331 p = self[letter][i] 332 if p > 0: 333 logodds = float("inf") 334 else: 335 logodds = _nan 336 values[letter].append(logodds) 337 pssm = PositionSpecificScoringMatrix(alphabet, values) 338 return pssm
339
340 341 -class PositionSpecificScoringMatrix(GenericPositionMatrix):
342
343 - def calculate(self, sequence):
344 """ 345 returns the PWM score for a given sequence for all positions. 346 347 - the sequence can only be a DNA sequence 348 - the search is performed only on one strand 349 - if the sequence and the motif have the same length, a single 350 number is returned 351 - otherwise, the result is a one-dimensional list or numpy array 352 """ 353 #TODO - Code itself tolerates ambiguous bases (as NaN). 354 if not isinstance(self.alphabet, IUPAC.IUPACUnambiguousDNA): 355 raise ValueError("PSSM has wrong alphabet: %s - Use only with DNA motifs" \ 356 % self.alphabet) 357 if not isinstance(sequence.alphabet, IUPAC.IUPACUnambiguousDNA): 358 raise ValueError("Sequence has wrong alphabet: %r - Use only with DNA sequences" \ 359 % sequence.alphabet) 360 361 #TODO - Force uppercase here and optimise switch statement in C 362 #by assuming upper case? 363 sequence = str(sequence) 364 m = self.length 365 n = len(sequence) 366 367 scores = [] 368 # check if the fast C code can be used 369 try: 370 import _pwm 371 except ImportError: 372 # use the slower Python code otherwise 373 #The C code handles mixed case so Python version must too: 374 sequence = sequence.upper() 375 for i in range(n-m+1): 376 score = 0.0 377 for position in range(m): 378 letter = sequence[i+position] 379 try: 380 score += self[letter][position] 381 except KeyError: 382 score = _nan 383 break 384 scores.append(score) 385 else: 386 # get the log-odds matrix into a proper shape 387 # (each row contains sorted (ACGT) log-odds values) 388 logodds = [[self[letter][i] for letter in "ACGT"] for i in range(m)] 389 scores = _pwm.calculate(sequence, logodds) 390 if len(scores)==1: 391 return scores[0] 392 else: 393 return scores
394
395 - def search(self, sequence, threshold=0.0, both=True):
396 """ 397 a generator function, returning found hits in a given sequence with the pwm score higher than the threshold 398 """ 399 sequence = sequence.upper() 400 n = len(sequence) 401 m = self.length 402 if both: 403 rc = self.reverse_complement() 404 for position in range(0, n-m+1): 405 s = sequence[position:position+m] 406 score = self.calculate(s) 407 if score > threshold: 408 yield (position, score) 409 if both: 410 score = rc.calculate(s) 411 if score > threshold: 412 yield (position-n, score)
413 414 @property
415 - def max(self):
416 """Maximal possible score for this motif. 417 418 returns the score computed for the consensus sequence. 419 """ 420 score = 0.0 421 letters = self._letters 422 for position in range(0, self.length): 423 score += max(self[letter][position] for letter in letters) 424 return score
425 426 @property
427 - def min(self):
428 """Minimal possible score for this motif. 429 430 returns the score computed for the anticonsensus sequence. 431 """ 432 score = 0.0 433 letters = self._letters 434 for position in range(0, self.length): 435 score += min(self[letter][position] for letter in letters) 436 return score
437 438 @property
439 - def gc_content(self):
440 raise Exception("Cannot compute the %GC composition of a PSSM")
441
442 - def mean(self, background=None):
443 """Expected value of the score of a motif.""" 444 if background is None: 445 background = dict.fromkeys(self._letters, 1.0) 446 else: 447 background = dict(background) 448 total = sum(background.values()) 449 for letter in self._letters: 450 background[letter] /= total 451 sx = 0.0 452 for i in range(self.length): 453 for letter in self._letters: 454 logodds = self[letter, i] 455 if _isnan(logodds): 456 continue 457 if _isinf(logodds) and logodds < 0: 458 continue 459 b = background[letter] 460 p = b * math.pow(2, logodds) 461 sx += p * logodds 462 return sx
463
464 - def std(self, background=None):
465 """Standard deviation of the score of a motif.""" 466 if background is None: 467 background = dict.fromkeys(self._letters, 1.0) 468 else: 469 background = dict(background) 470 total = sum(background.values()) 471 for letter in self._letters: 472 background[letter] /= total 473 variance = 0.0 474 for i in range(self.length): 475 sx = 0.0 476 sxx = 0.0 477 for letter in self._letters: 478 logodds = self[letter, i] 479 if _isnan(logodds): 480 continue 481 if _isinf(logodds) and logodds < 0: 482 continue 483 b = background[letter] 484 p = b * math.pow(2, logodds) 485 sx += p*logodds 486 sxx += p*logodds*logodds 487 sxx -= sx*sx 488 variance += sxx 489 variance = max(variance, 0) # to avoid roundoff problems 490 return math.sqrt(variance)
491
492 - def dist_pearson(self, other):
493 """ 494 return the similarity score based on pearson correlation for the given motif against self. 495 496 We use the Pearson's correlation of the respective probabilities. 497 """ 498 if self.alphabet != other.alphabet: 499 raise ValueError("Cannot compare motifs with different alphabets") 500 501 max_p=-2 502 for offset in range(-self.length+1, other.length): 503 if offset<0: 504 p = self.dist_pearson_at(other, -offset) 505 else: # offset>=0 506 p = other.dist_pearson_at(self, offset) 507 if max_p<p: 508 max_p=p 509 max_o=-offset 510 return 1-max_p, max_o
511
512 - def dist_pearson_at(self, other, offset):
513 letters = self._letters 514 sx = 0.0 # \sum x 515 sy = 0.0 # \sum y 516 sxx = 0.0 # \sum x^2 517 sxy = 0.0 # \sum x \cdot y 518 syy = 0.0 # \sum y^2 519 norm=max(self.length, offset+other.length)*len(letters) 520 for pos in range(min(self.length-offset, other.length)): 521 xi = [self[letter, pos+offset] for letter in letters] 522 yi = [other[letter, pos] for letter in letters] 523 sx += sum(xi) 524 sy += sum(yi) 525 sxx += sum(x*x for x in xi) 526 sxy += sum(x*y for x, y in zip(xi, yi)) 527 syy += sum(y*y for y in yi) 528 sx /= norm 529 sy /= norm 530 sxx /= norm 531 sxy /= norm 532 syy /= norm 533 numerator = sxy - sx*sy 534 denominator = math.sqrt((sxx-sx*sx)*(syy-sy*sy)) 535 return numerator/denominator
536
537 - def distribution(self, background=None, precision=10**3):
538 """calculate the distribution of the scores at the given precision.""" 539 from .thresholds import ScoreDistribution 540 if background is None: 541 background = dict.fromkeys(self._letters, 1.0) 542 else: 543 background = dict(background) 544 total = sum(background.values()) 545 for letter in self._letters: 546 background[letter] /= total 547 return ScoreDistribution(precision=precision, pssm=self, background=background)
548