Package Bio :: Package motifs
[hide private]
[frames] | no frames]

Source Code for Package Bio.motifs

  1  # Copyright 2003-2009 by Bartek Wilczynski.  All rights reserved. 
  2  # Copyright 2012-2013 by Michiel JL de Hoon.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Tools for sequence motif analysis. 
  7   
  8  Bio.motifs contains the core Motif class containing various I/O methods 
  9  as well as methods for motif comparisons and motif searching in sequences. 
 10  It also includes functionality for parsing output from the AlignACE, MEME, 
 11  and MAST programs, as well as files in the TRANSFAC format. 
 12   
 13  Bio.motifs is replacing the older and now obsolete Bio.Motif module. 
 14  """ 
 15   
 16  from __future__ import print_function 
 17   
 18  from Bio._py3k import range 
 19   
 20  import math 
 21   
 22  __docformat__ = "restructuredtext en" 
23 24 25 -def create(instances, alphabet=None):
26 instances = Instances(instances, alphabet) 27 return Motif(instances=instances, alphabet=alphabet)
28
29 30 -def parse(handle, format):
31 """Parses an output file of motif finding programs. 32 33 Currently supported formats (case is ignored): 34 35 - AlignAce: AlignAce output file format 36 - MEME: MEME output file motif 37 - MAST: MAST output file motif 38 - TRANSFAC: TRANSFAC database file format 39 - pfm: JASPAR-style position-frequency matrix 40 - jaspar: JASPAR-style multiple PFM format 41 - sites: JASPAR-style sites file 42 43 As files in the pfm and sites formats contain only a single motif, 44 it is easier to use Bio.motifs.read() instead of Bio.motifs.parse() 45 for those. 46 47 For example: 48 49 >>> from Bio import motifs 50 >>> with open("Motif/alignace.out") as handle: 51 ... for m in motifs.parse(handle, "AlignAce"): 52 ... print(m.consensus) 53 ... 54 TCTACGATTGAG 55 CTGCAGCTAGCTACGAGTGAG 56 GTGCTCTAAGCATAGTAGGCG 57 GCCACTAGCAGAGCAGGGGGC 58 CGACTCAGAGGTT 59 CCACGCTAAGAGAGGTGCCGGAG 60 GCGCGTCGCTGAGCA 61 GTCCATCGCAAAGCGTGGGGC 62 GGGATCAGAGGGCCG 63 TGGAGGCGGGG 64 GACCAGAGCTTCGCATGGGGG 65 GGCGTGCGTG 66 GCTGGTTGCTGTTCATTAGG 67 GCCGGCGGCAGCTAAAAGGG 68 GAGGCCGGGGAT 69 CGACTCGTGCTTAGAAGG 70 """ 71 format = format.lower() 72 if format == "alignace": 73 from Bio.motifs import alignace 74 record = alignace.read(handle) 75 return record 76 elif format == "meme": 77 from Bio.motifs import meme 78 record = meme.read(handle) 79 return record 80 elif format == "mast": 81 from Bio.motifs import mast 82 record = mast.read(handle) 83 return record 84 elif format == "transfac": 85 from Bio.motifs import transfac 86 record = transfac.read(handle) 87 return record 88 elif format in ('pfm', 'sites', 'jaspar'): 89 from Bio.motifs import jaspar 90 record = jaspar.read(handle, format) 91 return record 92 else: 93 raise ValueError("Unknown format %s" % format)
94
95 96 -def read(handle, format):
97 """Reads a motif from a handle using a specified file-format. 98 99 This supports the same formats as Bio.motifs.parse(), but 100 only for files containing exactly one motif. For example, 101 reading a JASPAR-style pfm file: 102 103 >>> from Bio import motifs 104 >>> with open("motifs/SRF.pfm") as handle: 105 ... m = motifs.read(handle, "pfm") 106 >>> m.consensus 107 Seq('GCCCATATATGG', IUPACUnambiguousDNA()) 108 109 Or a single-motif MEME file, 110 111 >>> from Bio import motifs 112 >>> with open("motifs/meme.out") as handle: 113 ... m = motifs.read(handle, "meme") 114 >>> m.consensus 115 Seq('CTCAATCGTA', IUPACUnambiguousDNA()) 116 117 If the handle contains no records, or more than one record, 118 an exception is raised: 119 120 >>> from Bio import motifs 121 >>> with open("motifs/alignace.out") as handle: 122 ... motif = motifs.read(handle, "AlignAce") 123 Traceback (most recent call last): 124 ... 125 ValueError: More than one motif found in handle 126 127 If however you want the first motif from a file containing 128 multiple motifs this function would raise an exception (as 129 shown in the example above). Instead use: 130 131 >>> from Bio import motifs 132 >>> with open("motifs/alignace.out") as handle: 133 ... record = motifs.parse(handle, "alignace") 134 >>> motif = record[0] 135 >>> motif.consensus 136 Seq('TCTACGATTGAG', IUPACUnambiguousDNA()) 137 138 Use the Bio.motifs.parse(handle, format) function if you want 139 to read multiple records from the handle. 140 """ 141 format = format.lower() 142 motifs = parse(handle, format) 143 if len(motifs) == 0: 144 raise ValueError("No motifs found in handle") 145 if len(motifs) > 1: 146 raise ValueError("More than one motif found in handle") 147 motif = motifs[0] 148 return motif
149
150 151 -class Instances(list):
152 """ 153 A class representing instances of sequence motifs. 154 """
155 - def __init__(self, instances=None, alphabet=None):
156 from Bio.Alphabet import IUPAC 157 from Bio.Seq import Seq 158 if instances is None: 159 instances = [] 160 self.length = None 161 for instance in instances: 162 if self.length is None: 163 self.length = len(instance) 164 elif self.length != len(instance): 165 message = "All instances should have the same length (%d found, %d expected)" % (len(instance), self.length) 166 raise ValueError(message) 167 try: 168 a = instance.alphabet 169 except AttributeError: 170 # The instance is a plain string 171 continue 172 if alphabet is None: 173 alphabet = a 174 elif alphabet != a: 175 raise ValueError("Alphabets are inconsistent") 176 if alphabet is None or alphabet.letters is None: 177 # If we didn't get a meaningful alphabet from the instances, 178 # assume it is DNA. 179 alphabet = IUPAC.unambiguous_dna 180 for instance in instances: 181 if not isinstance(instance, Seq): 182 sequence = str(instance) 183 instance = Seq(sequence, alphabet=alphabet) 184 self.append(instance) 185 self.alphabet = alphabet
186
187 - def __str__(self):
188 text = "" 189 for instance in self: 190 text += str(instance) + "\n" 191 return text
192
193 - def count(self):
194 counts = {} 195 for letter in self.alphabet.letters: 196 counts[letter] = [0] * self.length 197 for instance in self: 198 for position, letter in enumerate(instance): 199 counts[letter][position] += 1 200 return counts
201
202 - def search(self, sequence):
203 """ 204 a generator function, returning found positions of motif instances in a given sequence 205 """ 206 for pos in range(0, len(sequence) - self.length + 1): 207 for instance in self: 208 if str(instance) == str(sequence[pos:pos + self.length]): 209 yield (pos, instance) 210 break # no other instance will fit (we don't want to return multiple hits)
211
212 - def reverse_complement(self):
213 instances = Instances(alphabet=self.alphabet) 214 instances.length = self.length 215 for instance in self: 216 instance = instance.reverse_complement() 217 instances.append(instance) 218 return instances
219
220 221 -class Motif(object):
222 """ 223 A class representing sequence motifs. 224 """
225 - def __init__(self, alphabet=None, instances=None, counts=None):
226 from . import matrix 227 from Bio.Alphabet import IUPAC 228 self.name = "" 229 if counts is not None and instances is not None: 230 raise Exception(ValueError, 231 "Specify either instances or counts, don't specify both") 232 elif counts is not None: 233 if alphabet is None: 234 alphabet = IUPAC.unambiguous_dna 235 self.instances = None 236 self.counts = matrix.FrequencyPositionMatrix(alphabet, counts) 237 self.length = self.counts.length 238 elif instances is not None: 239 self.instances = instances 240 alphabet = self.instances.alphabet 241 counts = self.instances.count() 242 self.counts = matrix.FrequencyPositionMatrix(alphabet, counts) 243 self.length = self.counts.length 244 else: 245 self.counts = None 246 self.instances = None 247 self.length = None 248 if alphabet is None: 249 alphabet = IUPAC.unambiguous_dna 250 self.alphabet = alphabet 251 self.pseudocounts = None 252 self.background = None 253 self.mask = None
254
255 - def __get_mask(self):
256 return self.__mask
257
258 - def __set_mask(self, mask):
259 if self.length is None: 260 self.__mask = () 261 elif mask is None: 262 self.__mask = (1,) * self.length 263 elif len(mask) != self.length: 264 raise ValueError("The length (%d) of the mask is inconsistent with the length (%d) of the motif", (len(mask), self.length)) 265 elif isinstance(mask, str): 266 self.__mask = [] 267 for char in mask: 268 if char == "*": 269 self.__mask.append(1) 270 elif char == " ": 271 self.__mask.append(0) 272 else: 273 raise ValueError("Mask should contain only '*' or ' ' and not a '%s'" % char) 274 self.__mask = tuple(self.__mask) 275 else: 276 self.__mask = tuple(int(bool(c)) for c in mask)
277 278 mask = property(__get_mask, __set_mask) 279 del __get_mask 280 del __set_mask 281
282 - def __get_pseudocounts(self):
283 return self._pseudocounts
284
285 - def __set_pseudocounts(self, value):
286 self._pseudocounts = {} 287 if isinstance(value, dict): 288 self._pseudocounts = dict((letter, value[letter]) for letter in self.alphabet.letters) 289 else: 290 if value is None: 291 value = 0.0 292 self._pseudocounts = dict.fromkeys(self.alphabet.letters, value)
293 294 pseudocounts = property(__get_pseudocounts, __set_pseudocounts) 295 del __get_pseudocounts 296 del __set_pseudocounts 297
298 - def __get_background(self):
299 return self._background
300
301 - def __set_background(self, value):
302 if isinstance(value, dict): 303 self._background = dict((letter, value[letter]) for letter in self.alphabet.letters) 304 elif value is None: 305 self._background = dict.fromkeys(self.alphabet.letters, 1.0) 306 else: 307 if sorted(self.alphabet.letters) != ["A", "C", "G", "T"]: 308 raise Exception("Setting the background to a single value only works for DNA motifs (in which case the value is interpreted as the GC content") 309 self._background['A'] = (1.0 - value) / 2.0 310 self._background['C'] = value / 2.0 311 self._background['G'] = value / 2.0 312 self._background['T'] = (1.0 - value) / 2.0 313 total = sum(self._background.values()) 314 for letter in self.alphabet.letters: 315 self._background[letter] /= total
316 317 background = property(__get_background, __set_background) 318 del __get_background 319 del __set_background 320 321 @property
322 - def pwm(self):
323 return self.counts.normalize(self._pseudocounts)
324 325 @property
326 - def pssm(self):
327 return self.pwm.log_odds(self._background)
328
329 - def __str__(self, masked=False):
330 """ string representation of a motif. 331 """ 332 text = "" 333 if self.instances is not None: 334 text += str(self.instances) 335 336 if masked: 337 for i in range(self.length): 338 if self.__mask[i]: 339 text += "*" 340 else: 341 text += " " 342 text += "\n" 343 return text
344
345 - def __len__(self):
346 """return the length of a motif 347 348 Please use this method (i.e. invoke len(m)) instead of referring to m.length directly. 349 """ 350 if self.length is None: 351 return 0 352 else: 353 return self.length
354
355 - def reverse_complement(self):
356 """Gives the reverse complement of the motif.""" 357 alphabet = self.alphabet 358 if self.instances is not None: 359 instances = self.instances.reverse_complement() 360 res = Motif(instances=instances, alphabet=alphabet) 361 else: # has counts 362 res = Motif(alphabet) 363 res.counts = {} 364 res.counts["A"] = self.counts["T"][::-1] 365 res.counts["T"] = self.counts["A"][::-1] 366 res.counts["G"] = self.counts["C"][::-1] 367 res.counts["C"] = self.counts["G"][::-1] 368 res.length = self.length 369 res.__mask = self.__mask[::-1] 370 return res
371 372 @property
373 - def consensus(self):
374 """Returns the consensus sequence.""" 375 return self.counts.consensus
376 377 @property
378 - def anticonsensus(self):
379 """Returns the least probable pattern to be generated from this motif.""" 380 return self.counts.anticonsensus
381 382 @property
383 - def degenerate_consensus(self):
384 """Generate degenerate consesnsus sequence. 385 386 Following the rules adapted from 387 D. R. Cavener: "Comparison of the consensus sequence flanking 388 translational start sites in Drosophila and vertebrates." 389 Nucleic Acids Research 15(4): 1353-1361. (1987). 390 391 The same rules are used by TRANSFAC. 392 """ 393 return self.counts.degenerate_consensus
394
395 - def weblogo(self, fname, format="PNG", version="2.8.2", **kwds):
396 """Uses the Berkeley weblogo service to download and save a weblogo of itself. 397 398 Requires an internet connection. 399 400 The parameters from ``**kwds`` are passed directly to the weblogo server. 401 402 Currently, this method uses WebLogo version 3.3. 403 These are the arguments and their default values passed to 404 WebLogo 3.3; see their website at http://weblogo.threeplusone.com 405 for more information:: 406 407 'stack_width' : 'medium', 408 'stack_per_line' : '40', 409 'alphabet' : 'alphabet_dna', 410 'ignore_lower_case' : True, 411 'unit_name' : "bits", 412 'first_index' : '1', 413 'logo_start' : '1', 414 'logo_end': str(self.length), 415 'composition' : "comp_auto", 416 'percentCG' : '', 417 'scale_width' : True, 418 'show_errorbars' : True, 419 'logo_title' : '', 420 'logo_label' : '', 421 'show_xaxis': True, 422 'xaxis_label': '', 423 'show_yaxis': True, 424 'yaxis_label': '', 425 'yaxis_scale': 'auto', 426 'yaxis_tic_interval' : '1.0', 427 'show_ends' : True, 428 'show_fineprint' : True, 429 'color_scheme': 'color_auto', 430 'symbols0': '', 431 'symbols1': '', 432 'symbols2': '', 433 'symbols3': '', 434 'symbols4': '', 435 'color0': '', 436 'color1': '', 437 'color2': '', 438 'color3': '', 439 'color4': '', 440 441 """ 442 from Bio._py3k import urlopen, urlencode, Request 443 444 frequencies = self.format('transfac') 445 url = 'http://weblogo.threeplusone.com/create.cgi' 446 values = {'sequences': frequencies, 447 'format': format.lower(), 448 'stack_width': 'medium', 449 'stack_per_line': '40', 450 'alphabet': 'alphabet_dna', 451 'ignore_lower_case': True, 452 'unit_name': "bits", 453 'first_index': '1', 454 'logo_start': '1', 455 'logo_end': str(self.length), 456 'composition': "comp_auto", 457 'percentCG': '', 458 'scale_width': True, 459 'show_errorbars': True, 460 'logo_title': '', 461 'logo_label': '', 462 'show_xaxis': True, 463 'xaxis_label': '', 464 'show_yaxis': True, 465 'yaxis_label': '', 466 'yaxis_scale': 'auto', 467 'yaxis_tic_interval': '1.0', 468 'show_ends': True, 469 'show_fineprint': True, 470 'color_scheme': 'color_auto', 471 'symbols0': '', 472 'symbols1': '', 473 'symbols2': '', 474 'symbols3': '', 475 'symbols4': '', 476 'color0': '', 477 'color1': '', 478 'color2': '', 479 'color3': '', 480 'color4': '', 481 } 482 483 values.update( 484 dict((k, "" if v is False else str(v)) for k, v in kwds.items())) 485 data = urlencode(values).encode("utf-8") 486 req = Request(url, data) 487 response = urlopen(req) 488 with open(fname, "wb") as f: 489 im = response.read() 490 f.write(im)
491
492 - def format(self, format):
493 """Returns a string representation of the Motif in a given format 494 495 Currently supported fromats: 496 - pfm : JASPAR single Position Frequency Matrix 497 - jaspar : JASPAR multiple Position Frequency Matrix 498 - transfac : TRANSFAC like files 499 """ 500 501 if format in ('pfm', 'jaspar'): 502 from Bio.motifs import jaspar 503 motifs = [self] 504 return jaspar.write(motifs, format) 505 elif format == "transfac": 506 from Bio.motifs import transfac 507 motifs = [self] 508 return transfac.write(motifs) 509 else: 510 raise ValueError("Unknown format type %s" % format)
511
512 513 -def write(motifs, format):
514 """Returns a string representation of motifs in a given format 515 516 Currently supported formats (case is ignored): 517 - pfm : JASPAR simple single Position Frequency Matrix 518 - jaspar : JASPAR multiple PFM format 519 - transfac : TRANSFAC like files 520 """ 521 522 format = format.lower() 523 if format in ("pfm", "jaspar"): 524 from Bio.motifs import jaspar 525 return jaspar.write(motifs, format) 526 elif format == "transfac": 527 from Bio.motifs import transfac 528 return transfac.write(motifs) 529 else: 530 raise ValueError("Unknown format type %s" % format)
531 532 533 if __name__ == "__main__": 534 from Bio._utils import run_doctest 535 run_doctest(verbose=0) 536