Package Bio :: Package motifs :: Module matrix
[hide private]
[frames] | no frames]

Source Code for Module Bio.motifs.matrix

  1  # Copyright 2013 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Implementation of frequency (count) matrices, position-weight matrices, 
  6  and position-specific scoring matrices. 
  7  """ 
  8   
  9  import math 
 10  import platform 
 11   
 12  from Bio._py3k import range 
 13   
 14  from Bio.Seq import Seq 
 15  from Bio.Alphabet import IUPAC 
 16  from Bio import Alphabet 
17 18 19 -class GenericPositionMatrix(dict):
20
21 - def __init__(self, alphabet, values):
22 self.length = None 23 for letter in alphabet.letters: 24 if self.length is None: 25 self.length = len(values[letter]) 26 elif self.length != len(values[letter]): 27 raise Exception("data has inconsistent lengths") 28 self[letter] = list(values[letter]) 29 self.alphabet = alphabet 30 self._letters = sorted(self.alphabet.letters)
31
32 - def __str__(self):
33 words = ["%6d" % i for i in range(self.length)] 34 line = " " + " ".join(words) 35 lines = [line] 36 for letter in self._letters: 37 words = ["%6.2f" % value for value in self[letter]] 38 line = "%c: " % letter + " ".join(words) 39 lines.append(line) 40 text = "\n".join(lines) + "\n" 41 return text
42
43 - def __getitem__(self, key):
44 if isinstance(key, tuple): 45 if len(key) == 2: 46 key1, key2 = key 47 if isinstance(key1, slice): 48 start1, stop1, stride1 = key1.indices(len(self._letters)) 49 indices1 = range(start1, stop1, stride1) 50 letters1 = [self._letters[i] for i in indices1] 51 dim1 = 2 52 elif isinstance(key1, int): 53 letter1 = self._letters[key1] 54 dim1 = 1 55 elif isinstance(key1, tuple): 56 letters1 = [self._letters[i] for i in key1] 57 dim1 = 2 58 elif isinstance(key1, str): 59 if len(key1) == 1: 60 letter1 = key1 61 dim1 = 1 62 else: 63 raise KeyError(key1) 64 else: 65 raise KeyError("Cannot understand key %s", str(key1)) 66 if isinstance(key2, slice): 67 start2, stop2, stride2 = key2.indices(self.length) 68 indices2 = range(start2, stop2, stride2) 69 dim2 = 2 70 elif isinstance(key2, int): 71 index2 = key2 72 dim2 = 1 73 else: 74 raise KeyError("Cannot understand key %s", str(key2)) 75 if dim1 == 1 and dim2 == 1: 76 return dict.__getitem__(self, letter1)[index2] 77 elif dim1 == 1 and dim2 == 2: 78 values = dict.__getitem__(self, letter1) 79 return tuple(values[index2] for index2 in indices2) 80 elif dim1 == 2 and dim2 == 1: 81 d = {} 82 for letter1 in letters1: 83 d[letter1] = dict.__getitem__(self, letter1)[index2] 84 return d 85 else: 86 d = {} 87 for letter1 in letters1: 88 values = dict.__getitem__(self, letter1) 89 d[letter1] = [values[index2] for index2 in indices2] 90 if sorted(letters1) == self._letters: 91 return self.__class__(self.alphabet, d) 92 else: 93 return d 94 elif len(key) == 1: 95 key = key[0] 96 else: 97 raise KeyError("keys should be 1- or 2-dimensional") 98 if isinstance(key, slice): 99 start, stop, stride = key.indices(len(self._letters)) 100 indices = range(start, stop, stride) 101 letters = [self._letters[i] for i in indices] 102 dim = 2 103 elif isinstance(key, int): 104 letter = self._letters[key] 105 dim = 1 106 elif isinstance(key, tuple): 107 letters = [self._letters[i] for i in key] 108 dim = 2 109 elif isinstance(key, str): 110 if len(key) == 1: 111 letter = key 112 dim = 1 113 else: 114 raise KeyError(key) 115 else: 116 raise KeyError("Cannot understand key %s", str(key)) 117 if dim == 1: 118 return dict.__getitem__(self, letter) 119 elif dim == 2: 120 d = {} 121 for letter in letters: 122 d[letter] = dict.__getitem__(self, letter) 123 return d 124 else: 125 raise RuntimeError("Should not get here")
126 127 @property
128 - def consensus(self):
129 """Returns the consensus sequence.""" 130 sequence = "" 131 for i in range(self.length): 132 try: 133 maximum = float("-inf") 134 except ValueError: 135 # On Python 2.5 or older that was handled in C code, 136 # and failed on Windows XP 32bit 137 maximum = - 1E400 138 for letter in self.alphabet.letters: 139 count = self[letter][i] 140 if count > maximum: 141 maximum = count 142 sequence_letter = letter 143 sequence += sequence_letter 144 return Seq(sequence, self.alphabet)
145 146 @property
147 - def anticonsensus(self):
148 sequence = "" 149 for i in range(self.length): 150 try: 151 minimum = float("inf") 152 except ValueError: 153 # On Python 2.5 or older that was handled in C code, 154 # and failed on Windows XP 32bit 155 minimum = 1E400 156 for letter in self.alphabet.letters: 157 count = self[letter][i] 158 if count < minimum: 159 minimum = count 160 sequence_letter = letter 161 sequence += sequence_letter 162 return Seq(sequence, self.alphabet)
163 164 @property
165 - def degenerate_consensus(self):
166 # Following the rules adapted from 167 # D. R. Cavener: "Comparison of the consensus sequence flanking 168 # translational start sites in Drosophila and vertebrates." 169 # Nucleic Acids Research 15(4): 1353-1361. (1987). 170 # The same rules are used by TRANSFAC. 171 degenerate_nucleotide = { 172 'A': 'A', 173 'C': 'C', 174 'G': 'G', 175 'T': 'T', 176 'AC': 'M', 177 'AG': 'R', 178 'AT': 'W', 179 'CG': 'S', 180 'CT': 'Y', 181 'GT': 'K', 182 'ACG': 'V', 183 'ACT': 'H', 184 'AGT': 'D', 185 'CGT': 'B', 186 'ACGT': 'N', 187 } 188 sequence = "" 189 for i in range(self.length): 190 def get(nucleotide): 191 return self[nucleotide][i]
192 nucleotides = sorted(self, key=get, reverse=True) 193 counts = [self[c][i] for c in nucleotides] 194 # Follow the Cavener rules: 195 if counts[0] >= sum(counts[1:]) and counts[0] >= 2 * counts[1]: 196 key = nucleotides[0] 197 elif 4 * sum(counts[:2]) > 3 * sum(counts): 198 key = "".join(sorted(nucleotides[:2])) 199 elif counts[3] == 0: 200 key = "".join(sorted(nucleotides[:3])) 201 else: 202 key = "ACGT" 203 nucleotide = degenerate_nucleotide.get(key, key) 204 sequence += nucleotide 205 if isinstance(self.alphabet, Alphabet.DNAAlphabet): 206 alpha = IUPAC.ambiguous_dna 207 elif isinstance(self.alphabet, Alphabet.RNAAlphabet): 208 alpha = IUPAC.ambiguous_rna 209 elif isinstance(self.alphabet, Alphabet.ProteinAlphabet): 210 alpha = IUPAC.protein 211 else: 212 raise Exception("Unknown alphabet") 213 return Seq(sequence, alphabet=alpha)
214 215 @property
216 - def gc_content(self):
217 """Compute the fraction GC content.""" 218 alphabet = self.alphabet 219 gc_total = 0.0 220 total = 0.0 221 for i in range(self.length): 222 for letter in alphabet.letters: 223 if letter in 'CG': 224 gc_total += self[letter][i] 225 total += self[letter][i] 226 return gc_total / total
227
228 - def reverse_complement(self):
229 values = {} 230 values["A"] = self["T"][::-1] 231 values["T"] = self["A"][::-1] 232 values["G"] = self["C"][::-1] 233 values["C"] = self["G"][::-1] 234 alphabet = self.alphabet 235 return self.__class__(alphabet, values)
236
237 238 -class FrequencyPositionMatrix(GenericPositionMatrix):
239
240 - def normalize(self, pseudocounts=None):
241 """Create and return a position-weight matrix by normalizing the counts matrix. 242 243 If pseudocounts is None (default), no pseudocounts are added 244 to the counts. 245 246 If pseudocounts is a number, it is added to the counts before 247 calculating the position-weight matrix. 248 249 Alternatively, the pseudocounts can be a dictionary with a key 250 for each letter in the alphabet associated with the motif. 251 """ 252 counts = {} 253 if pseudocounts is None: 254 for letter in self.alphabet.letters: 255 counts[letter] = [0.0] * self.length 256 elif isinstance(pseudocounts, dict): 257 for letter in self.alphabet.letters: 258 counts[letter] = [float(pseudocounts[letter])] * self.length 259 else: 260 for letter in self.alphabet.letters: 261 counts[letter] = [float(pseudocounts)] * self.length 262 for i in range(self.length): 263 for letter in self.alphabet.letters: 264 counts[letter][i] += self[letter][i] 265 # Actual normalization is done in the PositionWeightMatrix initializer 266 return PositionWeightMatrix(self.alphabet, counts)
267
268 269 -class PositionWeightMatrix(GenericPositionMatrix):
270
271 - def __init__(self, alphabet, counts):
272 GenericPositionMatrix.__init__(self, alphabet, counts) 273 for i in range(self.length): 274 total = sum(float(self[letter][i]) for letter in alphabet.letters) 275 for letter in alphabet.letters: 276 self[letter][i] /= total 277 for letter in alphabet.letters: 278 self[letter] = tuple(self[letter])
279
280 - def log_odds(self, background=None):
281 """Returns the Position-Specific Scoring Matrix. 282 283 The Position-Specific Scoring Matrix (PSSM) contains the log-odds 284 scores computed from the probability matrix and the background 285 probabilities. If the background is None, a uniform background 286 distribution is assumed. 287 """ 288 values = {} 289 alphabet = self.alphabet 290 if background is None: 291 background = dict.fromkeys(self._letters, 1.0) 292 else: 293 background = dict(background) 294 total = sum(background.values()) 295 for letter in alphabet.letters: 296 background[letter] /= total 297 values[letter] = [] 298 for i in range(self.length): 299 for letter in alphabet.letters: 300 b = background[letter] 301 if b > 0: 302 p = self[letter][i] 303 if p > 0: 304 logodds = math.log(p / b, 2) 305 else: 306 # TODO - Ensure this has unittest coverage! 307 try: 308 logodds = float("-inf") 309 except ValueError: 310 # On Python 2.5 or older that was handled in C code, 311 # and failed on Windows XP 32bit 312 logodds = - 1E400 313 else: 314 p = self[letter][i] 315 if p > 0: 316 logodds = float("inf") 317 else: 318 logodds = float("nan") 319 values[letter].append(logodds) 320 pssm = PositionSpecificScoringMatrix(alphabet, values) 321 return pssm
322
323 324 -class PositionSpecificScoringMatrix(GenericPositionMatrix):
325 326 # Make sure that we use C-accelerated PWM calculations if running under CPython. 327 # Fall back to the slower Python implementation if Jython or IronPython. 328 try: 329 from . import _pwm 330
331 - def _calculate(self, sequence, m, n):
332 logodds = [[self[letter][i] for letter in "ACGT"] for i in range(m)] 333 return self._pwm.calculate(sequence, logodds)
334 except ImportError: 335 if platform.python_implementation() == 'CPython': 336 raise 337 else:
338 - def _calculate(self, sequence, m, n):
339 # The C code handles mixed case so Python version must too: 340 sequence = sequence.upper() 341 scores = [] 342 for i in range(n - m + 1): 343 score = 0.0 344 for position in range(m): 345 letter = sequence[i + position] 346 try: 347 score += self[letter][position] 348 except KeyError: 349 score = float("nan") 350 break 351 scores.append(score) 352 return scores
353
354 - def calculate(self, sequence):
355 """Returns the PWM score for a given sequence for all positions. 356 357 Notes: 358 359 - the sequence can only be a DNA sequence 360 - the search is performed only on one strand 361 - if the sequence and the motif have the same length, a single 362 number is returned 363 - otherwise, the result is a one-dimensional list or numpy array 364 """ 365 # TODO - Code itself tolerates ambiguous bases (as NaN). 366 if not isinstance(self.alphabet, IUPAC.IUPACUnambiguousDNA): 367 raise ValueError("PSSM has wrong alphabet: %s - Use only with DNA motifs" 368 % self.alphabet) 369 if not isinstance(sequence.alphabet, IUPAC.IUPACUnambiguousDNA): 370 raise ValueError("Sequence has wrong alphabet: %r - Use only with DNA sequences" 371 % sequence.alphabet) 372 373 # NOTE: The C code handles mixed case input as this could be large 374 # (e.g. contig or chromosome), so requiring it be all upper or lower 375 # case would impose an overhead to allocate the extra memory. 376 sequence = str(sequence) 377 m = self.length 378 n = len(sequence) 379 380 scores = self._calculate(sequence, m, n) 381 382 if len(scores) == 1: 383 return scores[0] 384 else: 385 return scores
386
387 - def search(self, sequence, threshold=0.0, both=True):
388 """Find hits with PWM score above given threshold. 389 390 A generator function, returning found hits in the given sequence 391 with the pwm score higher than the threshold. 392 """ 393 sequence = sequence.upper() 394 n = len(sequence) 395 m = self.length 396 if both: 397 rc = self.reverse_complement() 398 for position in range(0, n - m + 1): 399 s = sequence[position:position + m] 400 score = self.calculate(s) 401 if score > threshold: 402 yield (position, score) 403 if both: 404 score = rc.calculate(s) 405 if score > threshold: 406 yield (position - n, score)
407 408 @property
409 - def max(self):
410 """Maximal possible score for this motif. 411 412 returns the score computed for the consensus sequence. 413 """ 414 score = 0.0 415 letters = self._letters 416 for position in range(0, self.length): 417 score += max(self[letter][position] for letter in letters) 418 return score
419 420 @property
421 - def min(self):
422 """Minimal possible score for this motif. 423 424 returns the score computed for the anticonsensus sequence. 425 """ 426 score = 0.0 427 letters = self._letters 428 for position in range(0, self.length): 429 score += min(self[letter][position] for letter in letters) 430 return score
431 432 @property
433 - def gc_content(self):
434 raise Exception("Cannot compute the %GC composition of a PSSM")
435
436 - def mean(self, background=None):
437 """Expected value of the score of a motif.""" 438 if background is None: 439 background = dict.fromkeys(self._letters, 1.0) 440 else: 441 background = dict(background) 442 total = sum(background.values()) 443 for letter in self._letters: 444 background[letter] /= total 445 sx = 0.0 446 for i in range(self.length): 447 for letter in self._letters: 448 logodds = self[letter, i] 449 if math.isnan(logodds): 450 continue 451 if math.isinf(logodds) and logodds < 0: 452 continue 453 b = background[letter] 454 p = b * math.pow(2, logodds) 455 sx += p * logodds 456 return sx
457
458 - def std(self, background=None):
459 """Standard deviation of the score of a motif.""" 460 if background is None: 461 background = dict.fromkeys(self._letters, 1.0) 462 else: 463 background = dict(background) 464 total = sum(background.values()) 465 for letter in self._letters: 466 background[letter] /= total 467 variance = 0.0 468 for i in range(self.length): 469 sx = 0.0 470 sxx = 0.0 471 for letter in self._letters: 472 logodds = self[letter, i] 473 if math.isnan(logodds): 474 continue 475 if math.isinf(logodds) and logodds < 0: 476 continue 477 b = background[letter] 478 p = b * math.pow(2, logodds) 479 sx += p * logodds 480 sxx += p * logodds * logodds 481 sxx -= sx * sx 482 variance += sxx 483 variance = max(variance, 0) # to avoid roundoff problems 484 return math.sqrt(variance)
485
486 - def dist_pearson(self, other):
487 """Return the similarity score based on pearson correlation for the given motif against self. 488 489 We use the Pearson's correlation of the respective probabilities. 490 """ 491 if self.alphabet != other.alphabet: 492 raise ValueError("Cannot compare motifs with different alphabets") 493 494 max_p = -2 495 for offset in range(-self.length + 1, other.length): 496 if offset < 0: 497 p = self.dist_pearson_at(other, -offset) 498 else: # offset>=0 499 p = other.dist_pearson_at(self, offset) 500 if max_p < p: 501 max_p = p 502 max_o = -offset 503 return 1 - max_p, max_o
504
505 - def dist_pearson_at(self, other, offset):
506 letters = self._letters 507 sx = 0.0 # \sum x 508 sy = 0.0 # \sum y 509 sxx = 0.0 # \sum x^2 510 sxy = 0.0 # \sum x \cdot y 511 syy = 0.0 # \sum y^2 512 norm = max(self.length, offset + other.length) * len(letters) 513 for pos in range(min(self.length - offset, other.length)): 514 xi = [self[letter, pos + offset] for letter in letters] 515 yi = [other[letter, pos] for letter in letters] 516 sx += sum(xi) 517 sy += sum(yi) 518 sxx += sum(x * x for x in xi) 519 sxy += sum(x * y for x, y in zip(xi, yi)) 520 syy += sum(y * y for y in yi) 521 sx /= norm 522 sy /= norm 523 sxx /= norm 524 sxy /= norm 525 syy /= norm 526 numerator = sxy - sx * sy 527 denominator = math.sqrt((sxx - sx * sx) * (syy - sy * sy)) 528 return numerator / denominator
529
530 - def distribution(self, background=None, precision=10 ** 3):
531 """calculate the distribution of the scores at the given precision.""" 532 from .thresholds import ScoreDistribution 533 if background is None: 534 background = dict.fromkeys(self._letters, 1.0) 535 else: 536 background = dict(background) 537 total = sum(background.values()) 538 for letter in self._letters: 539 background[letter] /= total 540 return ScoreDistribution(precision=precision, pssm=self, background=background)
541