Package Bio :: Package motifs :: Module matrix
[hide private]
[frames] | no frames]

Source Code for Module Bio.motifs.matrix

  1  # Copyright 2013 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Implementation of frequency (count) matrices, position-weight matrices, 
  6  and position-specific scoring matrices. 
  7  """ 
  8   
  9  import math 
 10  import platform 
 11   
 12  from Bio._py3k import range 
 13   
 14  from Bio.Seq import Seq 
 15  from Bio.Alphabet import IUPAC 
 16  from Bio import Alphabet 
 17   
 18   
 19  # Make sure that we use C-accelerated PWM calculations if running under CPython. 
 20  # Fall back to the slower Python implementation if Jython or IronPython. 
 21  try: 
 22      from . import _pwm 
23 24 - def _calculate(score_dict, sequence, m, n):
25 """Calculate scores using C code (PRIVATE).""" 26 logodds = [[score_dict[letter][i] for letter in "ACGT"] for i in range(m)] 27 return _pwm.calculate(sequence, logodds)
28 29 except ImportError: 30 if platform.python_implementation() == 'CPython': 31 import warnings 32 from Bio import BiopythonWarning 33 warnings.warn("Using pure-Python as missing Biopython's C code for PWM. " 34 "This can happen if Biopython was installed without NumPy. " 35 "Try re-installing NumPy and then Biopython.", 36 BiopythonWarning)
37 38 - def _calculate(score_dict, sequence, m, n):
39 """Calculate scores using Python code (PRIVATE). 40 41 The C code handles mixed case so Python version must too. 42 """ 43 sequence = sequence.upper() 44 scores = [] 45 for i in range(n - m + 1): 46 score = 0.0 47 for position in range(m): 48 letter = sequence[i + position] 49 try: 50 score += score_dict[letter][position] 51 except KeyError: 52 score = float("nan") 53 break 54 scores.append(score) 55 return scores
56
57 58 -class GenericPositionMatrix(dict):
59
60 - def __init__(self, alphabet, values):
61 self.length = None 62 for letter in alphabet.letters: 63 if self.length is None: 64 self.length = len(values[letter]) 65 elif self.length != len(values[letter]): 66 raise Exception("data has inconsistent lengths") 67 self[letter] = list(values[letter]) 68 self.alphabet = alphabet 69 self._letters = sorted(self.alphabet.letters)
70
71 - def __str__(self):
72 words = ["%6d" % i for i in range(self.length)] 73 line = " " + " ".join(words) 74 lines = [line] 75 for letter in self._letters: 76 words = ["%6.2f" % value for value in self[letter]] 77 line = "%c: " % letter + " ".join(words) 78 lines.append(line) 79 text = "\n".join(lines) + "\n" 80 return text
81
82 - def __getitem__(self, key):
83 if isinstance(key, tuple): 84 if len(key) == 2: 85 key1, key2 = key 86 if isinstance(key1, slice): 87 start1, stop1, stride1 = key1.indices(len(self._letters)) 88 indices1 = range(start1, stop1, stride1) 89 letters1 = [self._letters[i] for i in indices1] 90 dim1 = 2 91 elif isinstance(key1, int): 92 letter1 = self._letters[key1] 93 dim1 = 1 94 elif isinstance(key1, tuple): 95 letters1 = [self._letters[i] for i in key1] 96 dim1 = 2 97 elif isinstance(key1, str): 98 if len(key1) == 1: 99 letter1 = key1 100 dim1 = 1 101 else: 102 raise KeyError(key1) 103 else: 104 raise KeyError("Cannot understand key %s", str(key1)) 105 if isinstance(key2, slice): 106 start2, stop2, stride2 = key2.indices(self.length) 107 indices2 = range(start2, stop2, stride2) 108 dim2 = 2 109 elif isinstance(key2, int): 110 index2 = key2 111 dim2 = 1 112 else: 113 raise KeyError("Cannot understand key %s", str(key2)) 114 if dim1 == 1 and dim2 == 1: 115 return dict.__getitem__(self, letter1)[index2] 116 elif dim1 == 1 and dim2 == 2: 117 values = dict.__getitem__(self, letter1) 118 return tuple(values[index2] for index2 in indices2) 119 elif dim1 == 2 and dim2 == 1: 120 d = {} 121 for letter1 in letters1: 122 d[letter1] = dict.__getitem__(self, letter1)[index2] 123 return d 124 else: 125 d = {} 126 for letter1 in letters1: 127 values = dict.__getitem__(self, letter1) 128 d[letter1] = [values[index2] for index2 in indices2] 129 if sorted(letters1) == self._letters: 130 return self.__class__(self.alphabet, d) 131 else: 132 return d 133 elif len(key) == 1: 134 key = key[0] 135 else: 136 raise KeyError("keys should be 1- or 2-dimensional") 137 if isinstance(key, slice): 138 start, stop, stride = key.indices(len(self._letters)) 139 indices = range(start, stop, stride) 140 letters = [self._letters[i] for i in indices] 141 dim = 2 142 elif isinstance(key, int): 143 letter = self._letters[key] 144 dim = 1 145 elif isinstance(key, tuple): 146 letters = [self._letters[i] for i in key] 147 dim = 2 148 elif isinstance(key, str): 149 if len(key) == 1: 150 letter = key 151 dim = 1 152 else: 153 raise KeyError(key) 154 else: 155 raise KeyError("Cannot understand key %s", str(key)) 156 if dim == 1: 157 return dict.__getitem__(self, letter) 158 elif dim == 2: 159 d = {} 160 for letter in letters: 161 d[letter] = dict.__getitem__(self, letter) 162 return d 163 else: 164 raise RuntimeError("Should not get here")
165 166 @property
167 - def consensus(self):
168 """Returns the consensus sequence.""" 169 sequence = "" 170 for i in range(self.length): 171 try: 172 maximum = float("-inf") 173 except ValueError: 174 # On Python 2.5 or older that was handled in C code, 175 # and failed on Windows XP 32bit 176 maximum = - 1E400 177 for letter in self.alphabet.letters: 178 count = self[letter][i] 179 if count > maximum: 180 maximum = count 181 sequence_letter = letter 182 sequence += sequence_letter 183 return Seq(sequence, self.alphabet)
184 185 @property
186 - def anticonsensus(self):
187 sequence = "" 188 for i in range(self.length): 189 try: 190 minimum = float("inf") 191 except ValueError: 192 # On Python 2.5 or older that was handled in C code, 193 # and failed on Windows XP 32bit 194 minimum = 1E400 195 for letter in self.alphabet.letters: 196 count = self[letter][i] 197 if count < minimum: 198 minimum = count 199 sequence_letter = letter 200 sequence += sequence_letter 201 return Seq(sequence, self.alphabet)
202 203 @property
204 - def degenerate_consensus(self):
205 # Following the rules adapted from 206 # D. R. Cavener: "Comparison of the consensus sequence flanking 207 # translational start sites in Drosophila and vertebrates." 208 # Nucleic Acids Research 15(4): 1353-1361. (1987). 209 # The same rules are used by TRANSFAC. 210 degenerate_nucleotide = { 211 'A': 'A', 212 'C': 'C', 213 'G': 'G', 214 'T': 'T', 215 'AC': 'M', 216 'AG': 'R', 217 'AT': 'W', 218 'CG': 'S', 219 'CT': 'Y', 220 'GT': 'K', 221 'ACG': 'V', 222 'ACT': 'H', 223 'AGT': 'D', 224 'CGT': 'B', 225 'ACGT': 'N', 226 } 227 sequence = "" 228 for i in range(self.length): 229 def get(nucleotide): 230 return self[nucleotide][i]
231 nucleotides = sorted(self, key=get, reverse=True) 232 counts = [self[c][i] for c in nucleotides] 233 # Follow the Cavener rules: 234 if counts[0] >= sum(counts[1:]) and counts[0] >= 2 * counts[1]: 235 key = nucleotides[0] 236 elif 4 * sum(counts[:2]) > 3 * sum(counts): 237 key = "".join(sorted(nucleotides[:2])) 238 elif counts[3] == 0: 239 key = "".join(sorted(nucleotides[:3])) 240 else: 241 key = "ACGT" 242 nucleotide = degenerate_nucleotide.get(key, key) 243 sequence += nucleotide 244 if isinstance(self.alphabet, Alphabet.DNAAlphabet): 245 alpha = IUPAC.ambiguous_dna 246 elif isinstance(self.alphabet, Alphabet.RNAAlphabet): 247 alpha = IUPAC.ambiguous_rna 248 elif isinstance(self.alphabet, Alphabet.ProteinAlphabet): 249 alpha = IUPAC.protein 250 else: 251 raise Exception("Unknown alphabet") 252 return Seq(sequence, alphabet=alpha)
253 254 @property
255 - def gc_content(self):
256 """Compute the fraction GC content.""" 257 alphabet = self.alphabet 258 gc_total = 0.0 259 total = 0.0 260 for i in range(self.length): 261 for letter in alphabet.letters: 262 if letter in 'CG': 263 gc_total += self[letter][i] 264 total += self[letter][i] 265 return gc_total / total
266
267 - def reverse_complement(self):
268 values = {} 269 values["A"] = self["T"][::-1] 270 values["T"] = self["A"][::-1] 271 values["G"] = self["C"][::-1] 272 values["C"] = self["G"][::-1] 273 alphabet = self.alphabet 274 return self.__class__(alphabet, values)
275
276 277 -class FrequencyPositionMatrix(GenericPositionMatrix):
278
279 - def normalize(self, pseudocounts=None):
280 """Create and return a position-weight matrix by normalizing the counts matrix. 281 282 If pseudocounts is None (default), no pseudocounts are added 283 to the counts. 284 285 If pseudocounts is a number, it is added to the counts before 286 calculating the position-weight matrix. 287 288 Alternatively, the pseudocounts can be a dictionary with a key 289 for each letter in the alphabet associated with the motif. 290 """ 291 counts = {} 292 if pseudocounts is None: 293 for letter in self.alphabet.letters: 294 counts[letter] = [0.0] * self.length 295 elif isinstance(pseudocounts, dict): 296 for letter in self.alphabet.letters: 297 counts[letter] = [float(pseudocounts[letter])] * self.length 298 else: 299 for letter in self.alphabet.letters: 300 counts[letter] = [float(pseudocounts)] * self.length 301 for i in range(self.length): 302 for letter in self.alphabet.letters: 303 counts[letter][i] += self[letter][i] 304 # Actual normalization is done in the PositionWeightMatrix initializer 305 return PositionWeightMatrix(self.alphabet, counts)
306
307 308 -class PositionWeightMatrix(GenericPositionMatrix):
309
310 - def __init__(self, alphabet, counts):
311 GenericPositionMatrix.__init__(self, alphabet, counts) 312 for i in range(self.length): 313 total = sum(float(self[letter][i]) for letter in alphabet.letters) 314 for letter in alphabet.letters: 315 self[letter][i] /= total 316 for letter in alphabet.letters: 317 self[letter] = tuple(self[letter])
318
319 - def log_odds(self, background=None):
320 """Returns the Position-Specific Scoring Matrix. 321 322 The Position-Specific Scoring Matrix (PSSM) contains the log-odds 323 scores computed from the probability matrix and the background 324 probabilities. If the background is None, a uniform background 325 distribution is assumed. 326 """ 327 values = {} 328 alphabet = self.alphabet 329 if background is None: 330 background = dict.fromkeys(self._letters, 1.0) 331 else: 332 background = dict(background) 333 total = sum(background.values()) 334 for letter in alphabet.letters: 335 background[letter] /= total 336 values[letter] = [] 337 for i in range(self.length): 338 for letter in alphabet.letters: 339 b = background[letter] 340 if b > 0: 341 p = self[letter][i] 342 if p > 0: 343 logodds = math.log(p / b, 2) 344 else: 345 # TODO - Ensure this has unittest coverage! 346 try: 347 logodds = float("-inf") 348 except ValueError: 349 # On Python 2.5 or older that was handled in C code, 350 # and failed on Windows XP 32bit 351 logodds = - 1E400 352 else: 353 p = self[letter][i] 354 if p > 0: 355 logodds = float("inf") 356 else: 357 logodds = float("nan") 358 values[letter].append(logodds) 359 pssm = PositionSpecificScoringMatrix(alphabet, values) 360 return pssm
361
362 363 -class PositionSpecificScoringMatrix(GenericPositionMatrix):
364
365 - def calculate(self, sequence):
366 """Returns the PWM score for a given sequence for all positions. 367 368 Notes: 369 - the sequence can only be a DNA sequence 370 - the search is performed only on one strand 371 - if the sequence and the motif have the same length, a single 372 number is returned 373 - otherwise, the result is a one-dimensional list or numpy array 374 375 """ 376 # TODO - Code itself tolerates ambiguous bases (as NaN). 377 if not isinstance(self.alphabet, IUPAC.IUPACUnambiguousDNA): 378 raise ValueError("PSSM has wrong alphabet: %s - Use only with DNA motifs" 379 % self.alphabet) 380 if not isinstance(sequence.alphabet, IUPAC.IUPACUnambiguousDNA): 381 raise ValueError("Sequence has wrong alphabet: %r - Use only with DNA sequences" 382 % sequence.alphabet) 383 384 # NOTE: The C code handles mixed case input as this could be large 385 # (e.g. contig or chromosome), so requiring it be all upper or lower 386 # case would impose an overhead to allocate the extra memory. 387 sequence = str(sequence) 388 m = self.length 389 n = len(sequence) 390 391 scores = _calculate(self, sequence, m, n) 392 393 if len(scores) == 1: 394 return scores[0] 395 else: 396 return scores
397
398 - def search(self, sequence, threshold=0.0, both=True):
399 """Find hits with PWM score above given threshold. 400 401 A generator function, returning found hits in the given sequence 402 with the pwm score higher than the threshold. 403 """ 404 sequence = sequence.upper() 405 n = len(sequence) 406 m = self.length 407 if both: 408 rc = self.reverse_complement() 409 for position in range(0, n - m + 1): 410 s = sequence[position:position + m] 411 score = self.calculate(s) 412 if score > threshold: 413 yield (position, score) 414 if both: 415 score = rc.calculate(s) 416 if score > threshold: 417 yield (position - n, score)
418 419 @property
420 - def max(self):
421 """Maximal possible score for this motif. 422 423 returns the score computed for the consensus sequence. 424 """ 425 score = 0.0 426 letters = self._letters 427 for position in range(0, self.length): 428 score += max(self[letter][position] for letter in letters) 429 return score
430 431 @property
432 - def min(self):
433 """Minimal possible score for this motif. 434 435 returns the score computed for the anticonsensus sequence. 436 """ 437 score = 0.0 438 letters = self._letters 439 for position in range(0, self.length): 440 score += min(self[letter][position] for letter in letters) 441 return score
442 443 @property
444 - def gc_content(self):
445 raise Exception("Cannot compute the %GC composition of a PSSM")
446
447 - def mean(self, background=None):
448 """Expected value of the score of a motif.""" 449 if background is None: 450 background = dict.fromkeys(self._letters, 1.0) 451 else: 452 background = dict(background) 453 total = sum(background.values()) 454 for letter in self._letters: 455 background[letter] /= total 456 sx = 0.0 457 for i in range(self.length): 458 for letter in self._letters: 459 logodds = self[letter, i] 460 if math.isnan(logodds): 461 continue 462 if math.isinf(logodds) and logodds < 0: 463 continue 464 b = background[letter] 465 p = b * math.pow(2, logodds) 466 sx += p * logodds 467 return sx
468
469 - def std(self, background=None):
470 """Standard deviation of the score of a motif.""" 471 if background is None: 472 background = dict.fromkeys(self._letters, 1.0) 473 else: 474 background = dict(background) 475 total = sum(background.values()) 476 for letter in self._letters: 477 background[letter] /= total 478 variance = 0.0 479 for i in range(self.length): 480 sx = 0.0 481 sxx = 0.0 482 for letter in self._letters: 483 logodds = self[letter, i] 484 if math.isnan(logodds): 485 continue 486 if math.isinf(logodds) and logodds < 0: 487 continue 488 b = background[letter] 489 p = b * math.pow(2, logodds) 490 sx += p * logodds 491 sxx += p * logodds * logodds 492 sxx -= sx * sx 493 variance += sxx 494 variance = max(variance, 0) # to avoid roundoff problems 495 return math.sqrt(variance)
496
497 - def dist_pearson(self, other):
498 """Return the similarity score based on pearson correlation for the given motif against self. 499 500 We use the Pearson's correlation of the respective probabilities. 501 """ 502 if self.alphabet != other.alphabet: 503 raise ValueError("Cannot compare motifs with different alphabets") 504 505 max_p = -2 506 for offset in range(-self.length + 1, other.length): 507 if offset < 0: 508 p = self.dist_pearson_at(other, -offset) 509 else: # offset>=0 510 p = other.dist_pearson_at(self, offset) 511 if max_p < p: 512 max_p = p 513 max_o = -offset 514 return 1 - max_p, max_o
515
516 - def dist_pearson_at(self, other, offset):
517 letters = self._letters 518 sx = 0.0 # \sum x 519 sy = 0.0 # \sum y 520 sxx = 0.0 # \sum x^2 521 sxy = 0.0 # \sum x \cdot y 522 syy = 0.0 # \sum y^2 523 norm = max(self.length, offset + other.length) * len(letters) 524 for pos in range(min(self.length - offset, other.length)): 525 xi = [self[letter, pos + offset] for letter in letters] 526 yi = [other[letter, pos] for letter in letters] 527 sx += sum(xi) 528 sy += sum(yi) 529 sxx += sum(x * x for x in xi) 530 sxy += sum(x * y for x, y in zip(xi, yi)) 531 syy += sum(y * y for y in yi) 532 sx /= norm 533 sy /= norm 534 sxx /= norm 535 sxy /= norm 536 syy /= norm 537 numerator = sxy - sx * sy 538 denominator = math.sqrt((sxx - sx * sx) * (syy - sy * sy)) 539 return numerator / denominator
540
541 - def distribution(self, background=None, precision=10 ** 3):
542 """Calculate the distribution of the scores at the given precision.""" 543 from .thresholds import ScoreDistribution 544 if background is None: 545 background = dict.fromkeys(self._letters, 1.0) 546 else: 547 background = dict(background) 548 total = sum(background.values()) 549 for letter in self._letters: 550 background[letter] /= total 551 return ScoreDistribution(precision=precision, pssm=self, background=background)
552