Package Bio :: Package motifs
[hide private]
[frames] | no frames]

Source Code for Package Bio.motifs

  1  # Copyright 2003-2009 by Bartek Wilczynski.  All rights reserved. 
  2  # Copyright 2012-2013 by Michiel JL de Hoon.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Tools for sequence motif analysis. 
  7   
  8  Bio.motifs contains the core Motif class containing various I/O methods 
  9  as well as methods for motif comparisons and motif searching in sequences. 
 10  It also includes functionality for parsing output from the AlignACE, MEME, 
 11  and MAST programs, as well as files in the TRANSFAC format. 
 12   
 13  Bio.motifs is replacing the older and now obsolete Bio.Motif module. 
 14  """ 
 15   
 16  from __future__ import print_function 
 17   
 18  from Bio._py3k import range 
 19   
 20  import math 
21 22 23 -def create(instances, alphabet=None):
24 instances = Instances(instances, alphabet) 25 return Motif(instances=instances, alphabet=alphabet)
26
27 28 -def parse(handle, format):
29 """Parses an output file of motif finding programs. 30 31 Currently supported formats (case is ignored): 32 33 - AlignAce: AlignAce output file format 34 - MEME: MEME output file motif 35 - MAST: MAST output file motif 36 - TRANSFAC: TRANSFAC database file format 37 - pfm: JASPAR-style position-frequency matrix 38 - jaspar: JASPAR-style multiple PFM format 39 - sites: JASPAR-style sites file 40 41 As files in the pfm and sites formats contain only a single motif, 42 it is easier to use Bio.motifs.read() instead of Bio.motifs.parse() 43 for those. 44 45 For example: 46 47 >>> from Bio import motifs 48 >>> with open("Motif/alignace.out") as handle: 49 ... for m in motifs.parse(handle, "AlignAce"): 50 ... print(m.consensus) 51 ... 52 TCTACGATTGAG 53 CTGCAGCTAGCTACGAGTGAG 54 GTGCTCTAAGCATAGTAGGCG 55 GCCACTAGCAGAGCAGGGGGC 56 CGACTCAGAGGTT 57 CCACGCTAAGAGAGGTGCCGGAG 58 GCGCGTCGCTGAGCA 59 GTCCATCGCAAAGCGTGGGGC 60 GGGATCAGAGGGCCG 61 TGGAGGCGGGG 62 GACCAGAGCTTCGCATGGGGG 63 GGCGTGCGTG 64 GCTGGTTGCTGTTCATTAGG 65 GCCGGCGGCAGCTAAAAGGG 66 GAGGCCGGGGAT 67 CGACTCGTGCTTAGAAGG 68 """ 69 format = format.lower() 70 if format == "alignace": 71 from Bio.motifs import alignace 72 record = alignace.read(handle) 73 return record 74 elif format == "meme": 75 from Bio.motifs import meme 76 record = meme.read(handle) 77 return record 78 elif format == "mast": 79 from Bio.motifs import mast 80 record = mast.read(handle) 81 return record 82 elif format == "transfac": 83 from Bio.motifs import transfac 84 record = transfac.read(handle) 85 return record 86 elif format in ('pfm', 'sites', 'jaspar'): 87 from Bio.motifs import jaspar 88 record = jaspar.read(handle, format) 89 return record 90 else: 91 raise ValueError("Unknown format %s" % format)
92
93 94 -def read(handle, format):
95 """Reads a motif from a handle using a specified file-format. 96 97 This supports the same formats as Bio.motifs.parse(), but 98 only for files containing exactly one motif. For example, 99 reading a JASPAR-style pfm file: 100 101 >>> from Bio import motifs 102 >>> with open("motifs/SRF.pfm") as handle: 103 ... m = motifs.read(handle, "pfm") 104 >>> m.consensus 105 Seq('GCCCATATATGG', IUPACUnambiguousDNA()) 106 107 Or a single-motif MEME file, 108 109 >>> from Bio import motifs 110 >>> with open("motifs/meme.out") as handle: 111 ... m = motifs.read(handle, "meme") 112 >>> m.consensus 113 Seq('CTCAATCGTA', IUPACUnambiguousDNA()) 114 115 If the handle contains no records, or more than one record, 116 an exception is raised: 117 118 >>> from Bio import motifs 119 >>> with open("motifs/alignace.out") as handle: 120 ... motif = motifs.read(handle, "AlignAce") 121 Traceback (most recent call last): 122 ... 123 ValueError: More than one motif found in handle 124 125 If however you want the first motif from a file containing 126 multiple motifs this function would raise an exception (as 127 shown in the example above). Instead use: 128 129 >>> from Bio import motifs 130 >>> with open("motifs/alignace.out") as handle: 131 ... record = motifs.parse(handle, "alignace") 132 >>> motif = record[0] 133 >>> motif.consensus 134 Seq('TCTACGATTGAG', IUPACUnambiguousDNA()) 135 136 Use the Bio.motifs.parse(handle, format) function if you want 137 to read multiple records from the handle. 138 """ 139 format = format.lower() 140 motifs = parse(handle, format) 141 if len(motifs) == 0: 142 raise ValueError("No motifs found in handle") 143 if len(motifs) > 1: 144 raise ValueError("More than one motif found in handle") 145 motif = motifs[0] 146 return motif
147
148 149 -class Instances(list):
150 """ 151 A class representing instances of sequence motifs. 152 """
153 - def __init__(self, instances=None, alphabet=None):
154 from Bio.Alphabet import IUPAC 155 from Bio.Seq import Seq 156 if instances is None: 157 instances = [] 158 self.length = None 159 for instance in instances: 160 if self.length is None: 161 self.length = len(instance) 162 elif self.length != len(instance): 163 message = "All instances should have the same length (%d found, %d expected)" % (len(instance), self.length) 164 raise ValueError(message) 165 try: 166 a = instance.alphabet 167 except AttributeError: 168 # The instance is a plain string 169 continue 170 if alphabet is None: 171 alphabet = a 172 elif alphabet != a: 173 raise ValueError("Alphabets are inconsistent") 174 if alphabet is None or alphabet.letters is None: 175 # If we didn't get a meaningful alphabet from the instances, 176 # assume it is DNA. 177 alphabet = IUPAC.unambiguous_dna 178 for instance in instances: 179 if not isinstance(instance, Seq): 180 sequence = str(instance) 181 instance = Seq(sequence, alphabet=alphabet) 182 self.append(instance) 183 self.alphabet = alphabet
184
185 - def __str__(self):
186 text = "" 187 for instance in self: 188 text += str(instance) + "\n" 189 return text
190
191 - def count(self):
192 counts = {} 193 for letter in self.alphabet.letters: 194 counts[letter] = [0] * self.length 195 for instance in self: 196 for position, letter in enumerate(instance): 197 counts[letter][position] += 1 198 return counts
199
200 - def search(self, sequence):
201 """ 202 a generator function, returning found positions of motif instances in a given sequence 203 """ 204 for pos in range(0, len(sequence) - self.length + 1): 205 for instance in self: 206 if str(instance) == str(sequence[pos:pos + self.length]): 207 yield (pos, instance) 208 break # no other instance will fit (we don't want to return multiple hits)
209
210 - def reverse_complement(self):
211 instances = Instances(alphabet=self.alphabet) 212 instances.length = self.length 213 for instance in self: 214 instance = instance.reverse_complement() 215 instances.append(instance) 216 return instances
217
218 219 -class Motif(object):
220 """ 221 A class representing sequence motifs. 222 """
223 - def __init__(self, alphabet=None, instances=None, counts=None):
224 from . import matrix 225 from Bio.Alphabet import IUPAC 226 self.name = "" 227 if counts is not None and instances is not None: 228 raise Exception(ValueError, 229 "Specify either instances or counts, don't specify both") 230 elif counts is not None: 231 if alphabet is None: 232 alphabet = IUPAC.unambiguous_dna 233 self.instances = None 234 self.counts = matrix.FrequencyPositionMatrix(alphabet, counts) 235 self.length = self.counts.length 236 elif instances is not None: 237 self.instances = instances 238 alphabet = self.instances.alphabet 239 counts = self.instances.count() 240 self.counts = matrix.FrequencyPositionMatrix(alphabet, counts) 241 self.length = self.counts.length 242 else: 243 self.counts = None 244 self.instances = None 245 self.length = None 246 if alphabet is None: 247 alphabet = IUPAC.unambiguous_dna 248 self.alphabet = alphabet 249 self.pseudocounts = None 250 self.background = None 251 self.mask = None
252
253 - def __get_mask(self):
254 return self.__mask
255
256 - def __set_mask(self, mask):
257 if self.length is None: 258 self.__mask = () 259 elif mask is None: 260 self.__mask = (1,) * self.length 261 elif len(mask) != self.length: 262 raise ValueError("The length (%d) of the mask is inconsistent with the length (%d) of the motif", (len(mask), self.length)) 263 elif isinstance(mask, str): 264 self.__mask = [] 265 for char in mask: 266 if char == "*": 267 self.__mask.append(1) 268 elif char == " ": 269 self.__mask.append(0) 270 else: 271 raise ValueError("Mask should contain only '*' or ' ' and not a '%s'" % char) 272 self.__mask = tuple(self.__mask) 273 else: 274 self.__mask = tuple(int(bool(c)) for c in mask)
275 276 mask = property(__get_mask, __set_mask) 277 del __get_mask 278 del __set_mask 279
280 - def __get_pseudocounts(self):
281 return self._pseudocounts
282
283 - def __set_pseudocounts(self, value):
284 self._pseudocounts = {} 285 if isinstance(value, dict): 286 self._pseudocounts = dict((letter, value[letter]) for letter in self.alphabet.letters) 287 else: 288 if value is None: 289 value = 0.0 290 self._pseudocounts = dict.fromkeys(self.alphabet.letters, value)
291 292 pseudocounts = property(__get_pseudocounts, __set_pseudocounts) 293 del __get_pseudocounts 294 del __set_pseudocounts 295
296 - def __get_background(self):
297 return self._background
298
299 - def __set_background(self, value):
300 if isinstance(value, dict): 301 self._background = dict((letter, value[letter]) for letter in self.alphabet.letters) 302 elif value is None: 303 self._background = dict.fromkeys(self.alphabet.letters, 1.0) 304 else: 305 if sorted(self.alphabet.letters) != ["A", "C", "G", "T"]: 306 # TODO - Should this be a ValueError? 307 raise Exception("Setting the background to a single value only " 308 "works for DNA motifs (in which case the value " 309 "is interpreted as the GC content") 310 self._background['A'] = (1.0 - value) / 2.0 311 self._background['C'] = value / 2.0 312 self._background['G'] = value / 2.0 313 self._background['T'] = (1.0 - value) / 2.0 314 total = sum(self._background.values()) 315 for letter in self.alphabet.letters: 316 self._background[letter] /= total
317 318 background = property(__get_background, __set_background) 319 del __get_background 320 del __set_background 321 322 @property
323 - def pwm(self):
324 return self.counts.normalize(self._pseudocounts)
325 326 @property
327 - def pssm(self):
328 return self.pwm.log_odds(self._background)
329
330 - def __str__(self, masked=False):
331 """ string representation of a motif. 332 """ 333 text = "" 334 if self.instances is not None: 335 text += str(self.instances) 336 337 if masked: 338 for i in range(self.length): 339 if self.__mask[i]: 340 text += "*" 341 else: 342 text += " " 343 text += "\n" 344 return text
345
346 - def __len__(self):
347 """return the length of a motif 348 349 Please use this method (i.e. invoke len(m)) instead of referring to m.length directly. 350 """ 351 if self.length is None: 352 return 0 353 else: 354 return self.length
355
356 - def reverse_complement(self):
357 """Gives the reverse complement of the motif.""" 358 alphabet = self.alphabet 359 if self.instances is not None: 360 instances = self.instances.reverse_complement() 361 res = Motif(instances=instances, alphabet=alphabet) 362 else: # has counts 363 res = Motif(alphabet) 364 res.counts = {} 365 res.counts["A"] = self.counts["T"][::-1] 366 res.counts["T"] = self.counts["A"][::-1] 367 res.counts["G"] = self.counts["C"][::-1] 368 res.counts["C"] = self.counts["G"][::-1] 369 res.length = self.length 370 res.__mask = self.__mask[::-1] 371 return res
372 373 @property
374 - def consensus(self):
375 """Returns the consensus sequence.""" 376 return self.counts.consensus
377 378 @property
379 - def anticonsensus(self):
380 """Returns the least probable pattern to be generated from this motif.""" 381 return self.counts.anticonsensus
382 383 @property
384 - def degenerate_consensus(self):
385 """Generate degenerate consesnsus sequence. 386 387 Following the rules adapted from 388 D. R. Cavener: "Comparison of the consensus sequence flanking 389 translational start sites in Drosophila and vertebrates." 390 Nucleic Acids Research 15(4): 1353-1361. (1987). 391 392 The same rules are used by TRANSFAC. 393 """ 394 return self.counts.degenerate_consensus
395
396 - def weblogo(self, fname, format="PNG", version="2.8.2", **kwds):
397 """Uses the Berkeley weblogo service to download and save a weblogo of itself. 398 399 Requires an internet connection. 400 401 The parameters from ``**kwds`` are passed directly to the weblogo server. 402 403 Currently, this method uses WebLogo version 3.3. 404 These are the arguments and their default values passed to 405 WebLogo 3.3; see their website at http://weblogo.threeplusone.com 406 for more information:: 407 408 'stack_width' : 'medium', 409 'stack_per_line' : '40', 410 'alphabet' : 'alphabet_dna', 411 'ignore_lower_case' : True, 412 'unit_name' : "bits", 413 'first_index' : '1', 414 'logo_start' : '1', 415 'logo_end': str(self.length), 416 'composition' : "comp_auto", 417 'percentCG' : '', 418 'scale_width' : True, 419 'show_errorbars' : True, 420 'logo_title' : '', 421 'logo_label' : '', 422 'show_xaxis': True, 423 'xaxis_label': '', 424 'show_yaxis': True, 425 'yaxis_label': '', 426 'yaxis_scale': 'auto', 427 'yaxis_tic_interval' : '1.0', 428 'show_ends' : True, 429 'show_fineprint' : True, 430 'color_scheme': 'color_auto', 431 'symbols0': '', 432 'symbols1': '', 433 'symbols2': '', 434 'symbols3': '', 435 'symbols4': '', 436 'color0': '', 437 'color1': '', 438 'color2': '', 439 'color3': '', 440 'color4': '', 441 442 """ 443 from Bio._py3k import urlopen, urlencode, Request 444 from Bio import Alphabet 445 446 if isinstance(self.alphabet, Alphabet.ProteinAlphabet): 447 alpha = "alphabet_protein" 448 elif isinstance(self.alphabet, Alphabet.RNAAlphabet): 449 alpha = "alphabet_rna" 450 elif isinstance(self.alphabet, Alphabet.DNAAlphabet): 451 alpha = "alphabet_dna" 452 else: 453 alpha = "auto" 454 455 frequencies = self.format('transfac') 456 url = 'http://weblogo.threeplusone.com/create.cgi' 457 values = {'sequences': frequencies, 458 'format': format.lower(), 459 'stack_width': 'medium', 460 'stack_per_line': '40', 461 'alphabet': alpha, 462 'ignore_lower_case': True, 463 'unit_name': "bits", 464 'first_index': '1', 465 'logo_start': '1', 466 'logo_end': str(self.length), 467 'composition': "comp_auto", 468 'percentCG': '', 469 'scale_width': True, 470 'show_errorbars': True, 471 'logo_title': '', 472 'logo_label': '', 473 'show_xaxis': True, 474 'xaxis_label': '', 475 'show_yaxis': True, 476 'yaxis_label': '', 477 'yaxis_scale': 'auto', 478 'yaxis_tic_interval': '1.0', 479 'show_ends': True, 480 'show_fineprint': True, 481 'color_scheme': 'color_auto', 482 'symbols0': '', 483 'symbols1': '', 484 'symbols2': '', 485 'symbols3': '', 486 'symbols4': '', 487 'color0': '', 488 'color1': '', 489 'color2': '', 490 'color3': '', 491 'color4': '', 492 } 493 494 values.update( 495 dict((k, "" if v is False else str(v)) for k, v in kwds.items())) 496 data = urlencode(values).encode("utf-8") 497 req = Request(url, data) 498 response = urlopen(req) 499 with open(fname, "wb") as f: 500 im = response.read() 501 f.write(im)
502
503 - def format(self, format):
504 """Returns a string representation of the Motif in a given format 505 506 Currently supported fromats: 507 - pfm : JASPAR single Position Frequency Matrix 508 - jaspar : JASPAR multiple Position Frequency Matrix 509 - transfac : TRANSFAC like files 510 """ 511 512 if format in ('pfm', 'jaspar'): 513 from Bio.motifs import jaspar 514 motifs = [self] 515 return jaspar.write(motifs, format) 516 elif format == "transfac": 517 from Bio.motifs import transfac 518 motifs = [self] 519 return transfac.write(motifs) 520 else: 521 raise ValueError("Unknown format type %s" % format)
522
523 524 -def write(motifs, format):
525 """Returns a string representation of motifs in a given format 526 527 Currently supported formats (case is ignored): 528 - pfm : JASPAR simple single Position Frequency Matrix 529 - jaspar : JASPAR multiple PFM format 530 - transfac : TRANSFAC like files 531 """ 532 533 format = format.lower() 534 if format in ("pfm", "jaspar"): 535 from Bio.motifs import jaspar 536 return jaspar.write(motifs, format) 537 elif format == "transfac": 538 from Bio.motifs import transfac 539 return transfac.write(motifs) 540 else: 541 raise ValueError("Unknown format type %s" % format)
542 543 544 if __name__ == "__main__": 545 from Bio._utils import run_doctest 546 run_doctest(verbose=0) 547