[frames] | no frames]

# Source Code for Module Bio.motifs.matrix

  1  # Copyright 2013 by Michiel de Hoon.  All rights reserved.
2  # This code is part of the Biopython distribution and governed by its
4  # as part of this package.
5  """Implementation of frequency (count) matrices, position-weight matrices,
6  and position-specific scoring matrices.
7  """
8
9  import math
10  import platform
11
12  from Bio._py3k import range
13
14  from Bio.Seq import Seq
15  from Bio.Alphabet import IUPAC
16  from Bio import Alphabet
17
18
19  # Make sure that we use C-accelerated PWM calculations if running under CPython.
20  # Fall back to the slower Python implementation if Jython or IronPython.
21  try:
22      from . import _pwm
23
24 -    def _calculate(score_dict, sequence, m, n):
25          """Calculate scores using C code (PRIVATE)."""
26          logodds = [[score_dict[letter][i] for letter in "ACGT"] for i in range(m)]
27          return _pwm.calculate(sequence, logodds)
28
29  except ImportError:
30      if platform.python_implementation() == 'CPython':
31          import warnings
32          from Bio import BiopythonWarning
33          warnings.warn("Using pure-Python as missing Biopython's C code for PWM. "
34                        "This can happen if Biopython was installed without NumPy. "
35                        "Try re-installing NumPy and then Biopython.",
36                        BiopythonWarning)
37
38 -    def _calculate(score_dict, sequence, m, n):
39          """Calculate scores using Python code (PRIVATE).
40
41          The C code handles mixed case so Python version must too.
42          """
43          sequence = sequence.upper()
44          scores = []
45          for i in range(n - m + 1):
46              score = 0.0
47              for position in range(m):
48                  letter = sequence[i + position]
49                  try:
50                      score += score_dict[letter][position]
51                  except KeyError:
52                      score = float("nan")
53                      break
54              scores.append(score)
55          return scores
56
57
58 -class GenericPositionMatrix(dict):
59
60 -    def __init__(self, alphabet, values):
61          self.length = None
62          for letter in alphabet.letters:
63              if self.length is None:
64                  self.length = len(values[letter])
65              elif self.length != len(values[letter]):
66                  raise Exception("data has inconsistent lengths")
67              self[letter] = list(values[letter])
68          self.alphabet = alphabet
69          self._letters = sorted(self.alphabet.letters)
70
71 -    def __str__(self):
72          words = ["%6d" % i for i in range(self.length)]
73          line = "   " + " ".join(words)
74          lines = [line]
75          for letter in self._letters:
76              words = ["%6.2f" % value for value in self[letter]]
77              line = "%c: " % letter + " ".join(words)
78              lines.append(line)
79          text = "\n".join(lines) + "\n"
80          return text
81
82 -    def __getitem__(self, key):
83          if isinstance(key, tuple):
84              if len(key) == 2:
85                  key1, key2 = key
86                  if isinstance(key1, slice):
87                      start1, stop1, stride1 = key1.indices(len(self._letters))
88                      indices1 = range(start1, stop1, stride1)
89                      letters1 = [self._letters[i] for i in indices1]
90                      dim1 = 2
91                  elif isinstance(key1, int):
92                      letter1 = self._letters[key1]
93                      dim1 = 1
94                  elif isinstance(key1, tuple):
95                      letters1 = [self._letters[i] for i in key1]
96                      dim1 = 2
97                  elif isinstance(key1, str):
98                      if len(key1) == 1:
99                          letter1 = key1
100                          dim1 = 1
101                      else:
102                          raise KeyError(key1)
103                  else:
104                      raise KeyError("Cannot understand key %s", str(key1))
105                  if isinstance(key2, slice):
106                      start2, stop2, stride2 = key2.indices(self.length)
107                      indices2 = range(start2, stop2, stride2)
108                      dim2 = 2
109                  elif isinstance(key2, int):
110                      index2 = key2
111                      dim2 = 1
112                  else:
113                      raise KeyError("Cannot understand key %s", str(key2))
114                  if dim1 == 1 and dim2 == 1:
115                      return dict.__getitem__(self, letter1)[index2]
116                  elif dim1 == 1 and dim2 == 2:
117                      values = dict.__getitem__(self, letter1)
118                      return tuple(values[index2] for index2 in indices2)
119                  elif dim1 == 2 and dim2 == 1:
120                      d = {}
121                      for letter1 in letters1:
122                          d[letter1] = dict.__getitem__(self, letter1)[index2]
123                      return d
124                  else:
125                      d = {}
126                      for letter1 in letters1:
127                          values = dict.__getitem__(self, letter1)
128                          d[letter1] = [values[index2] for index2 in indices2]
129                      if sorted(letters1) == self._letters:
130                          return self.__class__(self.alphabet, d)
131                      else:
132                          return d
133              elif len(key) == 1:
134                  key = key[0]
135              else:
136                  raise KeyError("keys should be 1- or 2-dimensional")
137          if isinstance(key, slice):
138              start, stop, stride = key.indices(len(self._letters))
139              indices = range(start, stop, stride)
140              letters = [self._letters[i] for i in indices]
141              dim = 2
142          elif isinstance(key, int):
143              letter = self._letters[key]
144              dim = 1
145          elif isinstance(key, tuple):
146              letters = [self._letters[i] for i in key]
147              dim = 2
148          elif isinstance(key, str):
149              if len(key) == 1:
150                  letter = key
151                  dim = 1
152              else:
153                  raise KeyError(key)
154          else:
155              raise KeyError("Cannot understand key %s", str(key))
156          if dim == 1:
157              return dict.__getitem__(self, letter)
158          elif dim == 2:
159              d = {}
160              for letter in letters:
161                  d[letter] = dict.__getitem__(self, letter)
162              return d
163          else:
164              raise RuntimeError("Should not get here")
165
166      @property
167 -    def consensus(self):
168          """Returns the consensus sequence."""
169          sequence = ""
170          for i in range(self.length):
171              try:
172                  maximum = float("-inf")
173              except ValueError:
174                  # On Python 2.5 or older that was handled in C code,
175                  # and failed on Windows XP 32bit
176                  maximum = - 1E400
177              for letter in self.alphabet.letters:
178                  count = self[letter][i]
179                  if count > maximum:
180                      maximum = count
181                      sequence_letter = letter
182              sequence += sequence_letter
183          return Seq(sequence, self.alphabet)
184
185      @property
186 -    def anticonsensus(self):
187          sequence = ""
188          for i in range(self.length):
189              try:
190                  minimum = float("inf")
191              except ValueError:
192                  # On Python 2.5 or older that was handled in C code,
193                  # and failed on Windows XP 32bit
194                  minimum = 1E400
195              for letter in self.alphabet.letters:
196                  count = self[letter][i]
197                  if count < minimum:
198                      minimum = count
199                      sequence_letter = letter
200              sequence += sequence_letter
201          return Seq(sequence, self.alphabet)
202
203      @property
204 -    def degenerate_consensus(self):
205          # Following the rules adapted from
206          # D. R. Cavener: "Comparison of the consensus sequence flanking
207          # translational start sites in Drosophila and vertebrates."
208          # Nucleic Acids Research 15(4): 1353-1361. (1987).
209          # The same rules are used by TRANSFAC.
210          degenerate_nucleotide = {
211              'A': 'A',
212              'C': 'C',
213              'G': 'G',
214              'T': 'T',
215              'AC': 'M',
216              'AG': 'R',
217              'AT': 'W',
218              'CG': 'S',
219              'CT': 'Y',
220              'GT': 'K',
221              'ACG': 'V',
222              'ACT': 'H',
223              'AGT': 'D',
224              'CGT': 'B',
225              'ACGT': 'N',
226          }
227          sequence = ""
228          for i in range(self.length):
229              def get(nucleotide):
230                  return self[nucleotide][i]
231              nucleotides = sorted(self, key=get, reverse=True)
232              counts = [self[c][i] for c in nucleotides]
233              # Follow the Cavener rules:
234              if counts[0] >= sum(counts[1:]) and counts[0] >= 2 * counts[1]:
235                  key = nucleotides[0]
236              elif 4 * sum(counts[:2]) > 3 * sum(counts):
237                  key = "".join(sorted(nucleotides[:2]))
238              elif counts[3] == 0:
239                  key = "".join(sorted(nucleotides[:3]))
240              else:
241                  key = "ACGT"
242              nucleotide = degenerate_nucleotide.get(key, key)
243              sequence += nucleotide
244          if isinstance(self.alphabet, Alphabet.DNAAlphabet):
245              alpha = IUPAC.ambiguous_dna
246          elif isinstance(self.alphabet, Alphabet.RNAAlphabet):
247              alpha = IUPAC.ambiguous_rna
248          elif isinstance(self.alphabet, Alphabet.ProteinAlphabet):
249              alpha = IUPAC.protein
250          else:
251              raise Exception("Unknown alphabet")
252          return Seq(sequence, alphabet=alpha)
253
254      @property
255 -    def gc_content(self):
256          """Compute the fraction GC content."""
257          alphabet = self.alphabet
258          gc_total = 0.0
259          total = 0.0
260          for i in range(self.length):
261              for letter in alphabet.letters:
262                  if letter in 'CG':
263                      gc_total += self[letter][i]
264                  total += self[letter][i]
265          return gc_total / total
266
267 -    def reverse_complement(self):
268          values = {}
269          values["A"] = self["T"][::-1]
270          values["T"] = self["A"][::-1]
271          values["G"] = self["C"][::-1]
272          values["C"] = self["G"][::-1]
273          alphabet = self.alphabet
274          return self.__class__(alphabet, values)
275
276
277 -class FrequencyPositionMatrix(GenericPositionMatrix):
278
279 -    def normalize(self, pseudocounts=None):
280          """Create and return a position-weight matrix by normalizing the counts matrix.
281
282          If pseudocounts is None (default), no pseudocounts are added
283          to the counts.
284
285          If pseudocounts is a number, it is added to the counts before
286          calculating the position-weight matrix.
287
288          Alternatively, the pseudocounts can be a dictionary with a key
289          for each letter in the alphabet associated with the motif.
290          """
291          counts = {}
292          if pseudocounts is None:
293              for letter in self.alphabet.letters:
294                  counts[letter] = [0.0] * self.length
295          elif isinstance(pseudocounts, dict):
296              for letter in self.alphabet.letters:
297                  counts[letter] = [float(pseudocounts[letter])] * self.length
298          else:
299              for letter in self.alphabet.letters:
300                  counts[letter] = [float(pseudocounts)] * self.length
301          for i in range(self.length):
302              for letter in self.alphabet.letters:
303                  counts[letter][i] += self[letter][i]
304          # Actual normalization is done in the PositionWeightMatrix initializer
305          return PositionWeightMatrix(self.alphabet, counts)
306
307
308 -class PositionWeightMatrix(GenericPositionMatrix):
309
310 -    def __init__(self, alphabet, counts):
311          GenericPositionMatrix.__init__(self, alphabet, counts)
312          for i in range(self.length):
313              total = sum(float(self[letter][i]) for letter in alphabet.letters)
314              for letter in alphabet.letters:
315                  self[letter][i] /= total
316          for letter in alphabet.letters:
317              self[letter] = tuple(self[letter])
318
319 -    def log_odds(self, background=None):
320          """Returns the Position-Specific Scoring Matrix.
321
322          The Position-Specific Scoring Matrix (PSSM) contains the log-odds
323          scores computed from the probability matrix and the background
324          probabilities. If the background is None, a uniform background
325          distribution is assumed.
326          """
327          values = {}
328          alphabet = self.alphabet
329          if background is None:
330              background = dict.fromkeys(self._letters, 1.0)
331          else:
332              background = dict(background)
333          total = sum(background.values())
334          for letter in alphabet.letters:
335              background[letter] /= total
336              values[letter] = []
337          for i in range(self.length):
338              for letter in alphabet.letters:
339                  b = background[letter]
340                  if b > 0:
341                      p = self[letter][i]
342                      if p > 0:
343                          logodds = math.log(p / b, 2)
344                      else:
345                          # TODO - Ensure this has unittest coverage!
346                          try:
347                              logodds = float("-inf")
348                          except ValueError:
349                              # On Python 2.5 or older that was handled in C code,
350                              # and failed on Windows XP 32bit
351                              logodds = - 1E400
352                  else:
353                      p = self[letter][i]
354                      if p > 0:
355                          logodds = float("inf")
356                      else:
357                          logodds = float("nan")
358                  values[letter].append(logodds)
359          pssm = PositionSpecificScoringMatrix(alphabet, values)
360          return pssm
361
362
363 -class PositionSpecificScoringMatrix(GenericPositionMatrix):
364
365 -    def calculate(self, sequence):
366          """Returns the PWM score for a given sequence for all positions.
367
368          Notes:
369
370           - the sequence can only be a DNA sequence
371           - the search is performed only on one strand
372           - if the sequence and the motif have the same length, a single
373             number is returned
374           - otherwise, the result is a one-dimensional list or numpy array
375          """
376          # TODO - Code itself tolerates ambiguous bases (as NaN).
377          if not isinstance(self.alphabet, IUPAC.IUPACUnambiguousDNA):
378              raise ValueError("PSSM has wrong alphabet: %s - Use only with DNA motifs"
379                                   % self.alphabet)
380          if not isinstance(sequence.alphabet, IUPAC.IUPACUnambiguousDNA):
381              raise ValueError("Sequence has wrong alphabet: %r - Use only with DNA sequences"
382                                   % sequence.alphabet)
383
384          # NOTE: The C code handles mixed case input as this could be large
385          # (e.g. contig or chromosome), so requiring it be all upper or lower
386          # case would impose an overhead to allocate the extra memory.
387          sequence = str(sequence)
388          m = self.length
389          n = len(sequence)
390
391          scores = _calculate(self, sequence, m, n)
392
393          if len(scores) == 1:
394              return scores[0]
395          else:
396              return scores
397
398 -    def search(self, sequence, threshold=0.0, both=True):
399          """Find hits with PWM score above given threshold.
400
401          A generator function, returning found hits in the given sequence
402          with the pwm score higher than the threshold.
403          """
404          sequence = sequence.upper()
405          n = len(sequence)
406          m = self.length
407          if both:
408              rc = self.reverse_complement()
409          for position in range(0, n - m + 1):
410              s = sequence[position:position + m]
411              score = self.calculate(s)
412              if score > threshold:
413                  yield (position, score)
414              if both:
415                  score = rc.calculate(s)
416                  if score > threshold:
417                      yield (position - n, score)
418
419      @property
420 -    def max(self):
421          """Maximal possible score for this motif.
422
423          returns the score computed for the consensus sequence.
424          """
425          score = 0.0
426          letters = self._letters
427          for position in range(0, self.length):
428              score += max(self[letter][position] for letter in letters)
429          return score
430
431      @property
432 -    def min(self):
433          """Minimal possible score for this motif.
434
435          returns the score computed for the anticonsensus sequence.
436          """
437          score = 0.0
438          letters = self._letters
439          for position in range(0, self.length):
440              score += min(self[letter][position] for letter in letters)
441          return score
442
443      @property
444 -    def gc_content(self):
445          raise Exception("Cannot compute the %GC composition of a PSSM")
446
447 -    def mean(self, background=None):
448          """Expected value of the score of a motif."""
449          if background is None:
450              background = dict.fromkeys(self._letters, 1.0)
451          else:
452              background = dict(background)
453          total = sum(background.values())
454          for letter in self._letters:
455              background[letter] /= total
456          sx = 0.0
457          for i in range(self.length):
458              for letter in self._letters:
459                  logodds = self[letter, i]
460                  if math.isnan(logodds):
461                      continue
462                  if math.isinf(logodds) and logodds < 0:
463                      continue
464                  b = background[letter]
465                  p = b * math.pow(2, logodds)
466                  sx += p * logodds
467          return sx
468
469 -    def std(self, background=None):
470          """Standard deviation of the score of a motif."""
471          if background is None:
472              background = dict.fromkeys(self._letters, 1.0)
473          else:
474              background = dict(background)
475          total = sum(background.values())
476          for letter in self._letters:
477              background[letter] /= total
478          variance = 0.0
479          for i in range(self.length):
480              sx = 0.0
481              sxx = 0.0
482              for letter in self._letters:
483                  logodds = self[letter, i]
484                  if math.isnan(logodds):
485                      continue
486                  if math.isinf(logodds) and logodds < 0:
487                      continue
488                  b = background[letter]
489                  p = b * math.pow(2, logodds)
490                  sx += p * logodds
491                  sxx += p * logodds * logodds
492              sxx -= sx * sx
493              variance += sxx
494          variance = max(variance, 0)  # to avoid roundoff problems
495          return math.sqrt(variance)
496
497 -    def dist_pearson(self, other):
498          """Return the similarity score based on pearson correlation for the given motif against self.
499
500          We use the Pearson's correlation of the respective probabilities.
501          """
502          if self.alphabet != other.alphabet:
503              raise ValueError("Cannot compare motifs with different alphabets")
504
505          max_p = -2
506          for offset in range(-self.length + 1, other.length):
507              if offset < 0:
508                  p = self.dist_pearson_at(other, -offset)
509              else:  # offset>=0
510                  p = other.dist_pearson_at(self, offset)
511              if max_p < p:
512                  max_p = p
513                  max_o = -offset
514          return 1 - max_p, max_o
515
516 -    def dist_pearson_at(self, other, offset):
517          letters = self._letters
518          sx = 0.0   # \sum x
519          sy = 0.0   # \sum y
520          sxx = 0.0  # \sum x^2
521          sxy = 0.0  # \sum x \cdot y
522          syy = 0.0  # \sum y^2
523          norm = max(self.length, offset + other.length) * len(letters)
524          for pos in range(min(self.length - offset, other.length)):
525              xi = [self[letter, pos + offset] for letter in letters]
526              yi = [other[letter, pos] for letter in letters]
527              sx += sum(xi)
528              sy += sum(yi)
529              sxx += sum(x * x for x in xi)
530              sxy += sum(x * y for x, y in zip(xi, yi))
531              syy += sum(y * y for y in yi)
532          sx /= norm
533          sy /= norm
534          sxx /= norm
535          sxy /= norm
536          syy /= norm
537          numerator = sxy - sx * sy
538          denominator = math.sqrt((sxx - sx * sx) * (syy - sy * sy))
539          return numerator / denominator
540
541 -    def distribution(self, background=None, precision=10 ** 3):
542          """calculate the distribution of the scores at the given precision."""
543          from .thresholds import ScoreDistribution
544          if background is None:
545              background = dict.fromkeys(self._letters, 1.0)
546          else:
547              background = dict(background)
548          total = sum(background.values())
549          for letter in self._letters:
550              background[letter] /= total
551          return ScoreDistribution(precision=precision, pssm=self, background=background)
552
<!--
expandto(location.href);
// -->



 Generated by Epydoc 3.0.1 on Thu Apr 6 17:43:53 2017 http://epydoc.sourceforge.net