Package Bio :: Package motifs :: Module matrix
[hide private]
[frames] | no frames]

Source Code for Module Bio.motifs.matrix

  1  # Copyright 2013 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Implementation of frequency (count) matrices, position-weight matrices, 
  6  and position-specific scoring matrices. 
  7  """ 
  8   
  9  import math 
 10  import platform 
 11   
 12  from Bio._py3k import range 
 13   
 14  from Bio.Seq import Seq 
 15  from Bio.Alphabet import IUPAC 
 16  from Bio import Alphabet 
17 18 19 -class GenericPositionMatrix(dict):
20
21 - def __init__(self, alphabet, values):
22 self.length = None 23 for letter in alphabet.letters: 24 if self.length is None: 25 self.length = len(values[letter]) 26 elif self.length != len(values[letter]): 27 raise Exception("data has inconsistent lengths") 28 self[letter] = list(values[letter]) 29 self.alphabet = alphabet 30 self._letters = sorted(self.alphabet.letters)
31
32 - def __str__(self):
33 words = ["%6d" % i for i in range(self.length)] 34 line = " " + " ".join(words) 35 lines = [line] 36 for letter in self._letters: 37 words = ["%6.2f" % value for value in self[letter]] 38 line = "%c: " % letter + " ".join(words) 39 lines.append(line) 40 text = "\n".join(lines) + "\n" 41 return text
42
43 - def __getitem__(self, key):
44 if isinstance(key, tuple): 45 if len(key) == 2: 46 key1, key2 = key 47 if isinstance(key1, slice): 48 start1, stop1, stride1 = key1.indices(len(self._letters)) 49 indices1 = range(start1, stop1, stride1) 50 letters1 = [self._letters[i] for i in indices1] 51 dim1 = 2 52 elif isinstance(key1, int): 53 letter1 = self._letters[key1] 54 dim1 = 1 55 elif isinstance(key1, tuple): 56 letters1 = [self._letters[i] for i in key1] 57 dim1 = 2 58 elif isinstance(key1, str): 59 if len(key1) == 1: 60 letter1 = key1 61 dim1 = 1 62 else: 63 raise KeyError(key1) 64 else: 65 raise KeyError("Cannot understand key %s", str(key1)) 66 if isinstance(key2, slice): 67 start2, stop2, stride2 = key2.indices(self.length) 68 indices2 = range(start2, stop2, stride2) 69 dim2 = 2 70 elif isinstance(key2, int): 71 index2 = key2 72 dim2 = 1 73 else: 74 raise KeyError("Cannot understand key %s", str(key2)) 75 if dim1 == 1 and dim2 == 1: 76 return dict.__getitem__(self, letter1)[index2] 77 elif dim1 == 1 and dim2 == 2: 78 values = dict.__getitem__(self, letter1) 79 return tuple(values[index2] for index2 in indices2) 80 elif dim1 == 2 and dim2 == 1: 81 d = {} 82 for letter1 in letters1: 83 d[letter1] = dict.__getitem__(self, letter1)[index2] 84 return d 85 else: 86 d = {} 87 for letter1 in letters1: 88 values = dict.__getitem__(self, letter1) 89 d[letter1] = [values[index2] for index2 in indices2] 90 if sorted(letters1) == self._letters: 91 return self.__class__(self.alphabet, d) 92 else: 93 return d 94 elif len(key) == 1: 95 key = key[0] 96 else: 97 raise KeyError("keys should be 1- or 2-dimensional") 98 if isinstance(key, slice): 99 start, stop, stride = key.indices(len(self._letters)) 100 indices = range(start, stop, stride) 101 letters = [self._letters[i] for i in indices] 102 dim = 2 103 elif isinstance(key, int): 104 letter = self._letters[key] 105 dim = 1 106 elif isinstance(key, tuple): 107 letters = [self._letters[i] for i in key] 108 dim = 2 109 elif isinstance(key, str): 110 if len(key) == 1: 111 letter = key 112 dim = 1 113 else: 114 raise KeyError(key) 115 else: 116 raise KeyError("Cannot understand key %s", str(key)) 117 if dim == 1: 118 return dict.__getitem__(self, letter) 119 elif dim == 2: 120 d = {} 121 for letter in letters: 122 d[letter] = dict.__getitem__(self, letter) 123 return d 124 else: 125 raise RuntimeError("Should not get here")
126 127 @property
128 - def consensus(self):
129 """Returns the consensus sequence.""" 130 sequence = "" 131 for i in range(self.length): 132 try: 133 maximum = float("-inf") 134 except ValueError: 135 # On Python 2.5 or older that was handled in C code, 136 # and failed on Windows XP 32bit 137 maximum = - 1E400 138 for letter in self.alphabet.letters: 139 count = self[letter][i] 140 if count > maximum: 141 maximum = count 142 sequence_letter = letter 143 sequence += sequence_letter 144 return Seq(sequence, self.alphabet)
145 146 @property
147 - def anticonsensus(self):
148 sequence = "" 149 for i in range(self.length): 150 try: 151 minimum = float("inf") 152 except ValueError: 153 # On Python 2.5 or older that was handled in C code, 154 # and failed on Windows XP 32bit 155 minimum = 1E400 156 for letter in self.alphabet.letters: 157 count = self[letter][i] 158 if count < minimum: 159 minimum = count 160 sequence_letter = letter 161 sequence += sequence_letter 162 return Seq(sequence, self.alphabet)
163 164 @property
165 - def degenerate_consensus(self):
166 # Following the rules adapted from 167 # D. R. Cavener: "Comparison of the consensus sequence flanking 168 # translational start sites in Drosophila and vertebrates." 169 # Nucleic Acids Research 15(4): 1353-1361. (1987). 170 # The same rules are used by TRANSFAC. 171 degenerate_nucleotide = { 172 'A': 'A', 173 'C': 'C', 174 'G': 'G', 175 'T': 'T', 176 'AC': 'M', 177 'AG': 'R', 178 'AT': 'W', 179 'CG': 'S', 180 'CT': 'Y', 181 'GT': 'K', 182 'ACG': 'V', 183 'ACT': 'H', 184 'AGT': 'D', 185 'CGT': 'B', 186 'ACGT': 'N', 187 } 188 sequence = "" 189 for i in range(self.length): 190 def get(nucleotide): 191 return self[nucleotide][i]
192 nucleotides = sorted(self, key=get, reverse=True) 193 counts = [self[c][i] for c in nucleotides] 194 # Follow the Cavener rules: 195 if counts[0] >= sum(counts[1:]) and counts[0] >= 2 * counts[1]: 196 key = nucleotides[0] 197 elif 4 * sum(counts[:2]) > 3 * sum(counts): 198 key = "".join(sorted(nucleotides[:2])) 199 elif counts[3] == 0: 200 key = "".join(sorted(nucleotides[:3])) 201 else: 202 key = "ACGT" 203 nucleotide = degenerate_nucleotide.get(key, key) 204 sequence += nucleotide 205 if isinstance(self.alphabet, Alphabet.DNAAlphabet): 206 alpha = IUPAC.ambiguous_dna 207 elif isinstance(self.alphabet, Alphabet.RNAAlphabet): 208 alpha = IUPAC.ambiguous_rna 209 elif isinstance(self.alphabet, Alphabet.ProteinAlphabet): 210 alpha = IUPAC.protein 211 else: 212 raise Exception("Unknown alphabet") 213 return Seq(sequence, alphabet=alpha)
214 215 @property
216 - def gc_content(self):
217 """Compute the fraction GC content.""" 218 alphabet = self.alphabet 219 gc_total = 0.0 220 total = 0.0 221 for i in range(self.length): 222 for letter in alphabet.letters: 223 if letter in 'CG': 224 gc_total += self[letter][i] 225 total += self[letter][i] 226 return gc_total / total
227
228 - def reverse_complement(self):
229 values = {} 230 values["A"] = self["T"][::-1] 231 values["T"] = self["A"][::-1] 232 values["G"] = self["C"][::-1] 233 values["C"] = self["G"][::-1] 234 alphabet = self.alphabet 235 return self.__class__(alphabet, values)
236
237 238 -class FrequencyPositionMatrix(GenericPositionMatrix):
239
240 - def normalize(self, pseudocounts=None):
241 """Create and return a position-weight matrix by normalizing the counts matrix. 242 243 If pseudocounts is None (default), no pseudocounts are added 244 to the counts. 245 246 If pseudocounts is a number, it is added to the counts before 247 calculating the position-weight matrix. 248 249 Alternatively, the pseudocounts can be a dictionary with a key 250 for each letter in the alphabet associated with the motif. 251 """ 252 counts = {} 253 if pseudocounts is None: 254 for letter in self.alphabet.letters: 255 counts[letter] = [0.0] * self.length 256 elif isinstance(pseudocounts, dict): 257 for letter in self.alphabet.letters: 258 counts[letter] = [float(pseudocounts[letter])] * self.length 259 else: 260 for letter in self.alphabet.letters: 261 counts[letter] = [float(pseudocounts)] * self.length 262 for i in range(self.length): 263 for letter in self.alphabet.letters: 264 counts[letter][i] += self[letter][i] 265 # Actual normalization is done in the PositionWeightMatrix initializer 266 return PositionWeightMatrix(self.alphabet, counts)
267
268 269 -class PositionWeightMatrix(GenericPositionMatrix):
270
271 - def __init__(self, alphabet, counts):
272 GenericPositionMatrix.__init__(self, alphabet, counts) 273 for i in range(self.length): 274 total = sum(float(self[letter][i]) for letter in alphabet.letters) 275 for letter in alphabet.letters: 276 self[letter][i] /= total 277 for letter in alphabet.letters: 278 self[letter] = tuple(self[letter])
279
280 - def log_odds(self, background=None):
281 """Returns the Position-Specific Scoring Matrix. 282 283 The Position-Specific Scoring Matrix (PSSM) contains the log-odds 284 scores computed from the probability matrix and the background 285 probabilities. If the background is None, a uniform background 286 distribution is assumed. 287 """ 288 values = {} 289 alphabet = self.alphabet 290 if background is None: 291 background = dict.fromkeys(self._letters, 1.0) 292 else: 293 background = dict(background) 294 total = sum(background.values()) 295 for letter in alphabet.letters: 296 background[letter] /= total 297 values[letter] = [] 298 for i in range(self.length): 299 for letter in alphabet.letters: 300 b = background[letter] 301 if b > 0: 302 p = self[letter][i] 303 if p > 0: 304 logodds = math.log(p / b, 2) 305 else: 306 # TODO - Ensure this has unittest coverage! 307 try: 308 logodds = float("-inf") 309 except ValueError: 310 # On Python 2.5 or older that was handled in C code, 311 # and failed on Windows XP 32bit 312 logodds = - 1E400 313 else: 314 p = self[letter][i] 315 if p > 0: 316 logodds = float("inf") 317 else: 318 logodds = float("nan") 319 values[letter].append(logodds) 320 pssm = PositionSpecificScoringMatrix(alphabet, values) 321 return pssm
322
323 324 -class PositionSpecificScoringMatrix(GenericPositionMatrix):
325 326 # Make sure that we use C-accelerated PWM calculations if running under CPython. 327 # Fall back to the slower Python implementation if Jython or IronPython. 328 try: 329 from . import _pwm 330
331 - def _calculate(self, sequence, m, n):
332 logodds = [[self[letter][i] for letter in "ACGT"] for i in range(m)] 333 return self._pwm.calculate(sequence, logodds)
334 except ImportError: 335 if platform.python_implementation() == 'CPython': 336 raise 337 else:
338 - def _calculate(self, sequence, m, n):
339 # The C code handles mixed case so Python version must too: 340 sequence = sequence.upper() 341 scores = [] 342 for i in range(n - m + 1): 343 score = 0.0 344 for position in range(m): 345 letter = sequence[i + position] 346 try: 347 score += self[letter][position] 348 except KeyError: 349 score = float("nan") 350 break 351 scores.append(score) 352 return scores
353
354 - def calculate(self, sequence):
355 """Returns the PWM score for a given sequence for all positions. 356 357 Notes: 358 359 - the sequence can only be a DNA sequence 360 - the search is performed only on one strand 361 - if the sequence and the motif have the same length, a single 362 number is returned 363 - otherwise, the result is a one-dimensional list or numpy array 364 """ 365 # TODO - Code itself tolerates ambiguous bases (as NaN). 366 if not isinstance(self.alphabet, IUPAC.IUPACUnambiguousDNA): 367 raise ValueError("PSSM has wrong alphabet: %s - Use only with DNA motifs" 368 % self.alphabet) 369 if not isinstance(sequence.alphabet, IUPAC.IUPACUnambiguousDNA): 370 raise ValueError("Sequence has wrong alphabet: %r - Use only with DNA sequences" 371 % sequence.alphabet) 372 373 # TODO - Force uppercase here and optimise switch statement in C 374 # by assuming upper case? 375 sequence = str(sequence) 376 m = self.length 377 n = len(sequence) 378 379 scores = self._calculate(sequence, m, n) 380 381 if len(scores) == 1: 382 return scores[0] 383 else: 384 return scores
385
386 - def search(self, sequence, threshold=0.0, both=True):
387 """Find hits with PWM score above given threshold. 388 389 A generator function, returning found hits in the given sequence 390 with the pwm score higher than the threshold. 391 """ 392 sequence = sequence.upper() 393 n = len(sequence) 394 m = self.length 395 if both: 396 rc = self.reverse_complement() 397 for position in range(0, n - m + 1): 398 s = sequence[position:position + m] 399 score = self.calculate(s) 400 if score > threshold: 401 yield (position, score) 402 if both: 403 score = rc.calculate(s) 404 if score > threshold: 405 yield (position - n, score)
406 407 @property
408 - def max(self):
409 """Maximal possible score for this motif. 410 411 returns the score computed for the consensus sequence. 412 """ 413 score = 0.0 414 letters = self._letters 415 for position in range(0, self.length): 416 score += max(self[letter][position] for letter in letters) 417 return score
418 419 @property
420 - def min(self):
421 """Minimal possible score for this motif. 422 423 returns the score computed for the anticonsensus sequence. 424 """ 425 score = 0.0 426 letters = self._letters 427 for position in range(0, self.length): 428 score += min(self[letter][position] for letter in letters) 429 return score
430 431 @property
432 - def gc_content(self):
433 raise Exception("Cannot compute the %GC composition of a PSSM")
434
435 - def mean(self, background=None):
436 """Expected value of the score of a motif.""" 437 if background is None: 438 background = dict.fromkeys(self._letters, 1.0) 439 else: 440 background = dict(background) 441 total = sum(background.values()) 442 for letter in self._letters: 443 background[letter] /= total 444 sx = 0.0 445 for i in range(self.length): 446 for letter in self._letters: 447 logodds = self[letter, i] 448 if math.isnan(logodds): 449 continue 450 if math.isinf(logodds) and logodds < 0: 451 continue 452 b = background[letter] 453 p = b * math.pow(2, logodds) 454 sx += p * logodds 455 return sx
456
457 - def std(self, background=None):
458 """Standard deviation of the score of a motif.""" 459 if background is None: 460 background = dict.fromkeys(self._letters, 1.0) 461 else: 462 background = dict(background) 463 total = sum(background.values()) 464 for letter in self._letters: 465 background[letter] /= total 466 variance = 0.0 467 for i in range(self.length): 468 sx = 0.0 469 sxx = 0.0 470 for letter in self._letters: 471 logodds = self[letter, i] 472 if math.isnan(logodds): 473 continue 474 if math.isinf(logodds) and logodds < 0: 475 continue 476 b = background[letter] 477 p = b * math.pow(2, logodds) 478 sx += p * logodds 479 sxx += p * logodds * logodds 480 sxx -= sx * sx 481 variance += sxx 482 variance = max(variance, 0) # to avoid roundoff problems 483 return math.sqrt(variance)
484
485 - def dist_pearson(self, other):
486 """Return the similarity score based on pearson correlation for the given motif against self. 487 488 We use the Pearson's correlation of the respective probabilities. 489 """ 490 if self.alphabet != other.alphabet: 491 raise ValueError("Cannot compare motifs with different alphabets") 492 493 max_p = -2 494 for offset in range(-self.length + 1, other.length): 495 if offset < 0: 496 p = self.dist_pearson_at(other, -offset) 497 else: # offset>=0 498 p = other.dist_pearson_at(self, offset) 499 if max_p < p: 500 max_p = p 501 max_o = -offset 502 return 1 - max_p, max_o
503
504 - def dist_pearson_at(self, other, offset):
505 letters = self._letters 506 sx = 0.0 # \sum x 507 sy = 0.0 # \sum y 508 sxx = 0.0 # \sum x^2 509 sxy = 0.0 # \sum x \cdot y 510 syy = 0.0 # \sum y^2 511 norm = max(self.length, offset + other.length) * len(letters) 512 for pos in range(min(self.length - offset, other.length)): 513 xi = [self[letter, pos + offset] for letter in letters] 514 yi = [other[letter, pos] for letter in letters] 515 sx += sum(xi) 516 sy += sum(yi) 517 sxx += sum(x * x for x in xi) 518 sxy += sum(x * y for x, y in zip(xi, yi)) 519 syy += sum(y * y for y in yi) 520 sx /= norm 521 sy /= norm 522 sxx /= norm 523 sxy /= norm 524 syy /= norm 525 numerator = sxy - sx * sy 526 denominator = math.sqrt((sxx - sx * sx) * (syy - sy * sy)) 527 return numerator / denominator
528
529 - def distribution(self, background=None, precision=10 ** 3):
530 """calculate the distribution of the scores at the given precision.""" 531 from .thresholds import ScoreDistribution 532 if background is None: 533 background = dict.fromkeys(self._letters, 1.0) 534 else: 535 background = dict(background) 536 total = sum(background.values()) 537 for letter in self._letters: 538 background[letter] /= total 539 return ScoreDistribution(precision=precision, pssm=self, background=background)
540