Package Bio :: Package motifs
[hide private]
[frames] | no frames]

Source Code for Package Bio.motifs

  1  # Copyright 2003-2009 by Bartek Wilczynski.  All rights reserved. 
  2  # Copyright 2012-2013 by Michiel JL de Hoon.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Tools for sequence motif analysis. 
  7   
  8  Bio.motifs contains the core Motif class containing various I/O methods 
  9  as well as methods for motif comparisons and motif searching in sequences. 
 10  It also includes functionality for parsing output from the AlignACE, MEME, 
 11  and MAST programs, as well as files in the TRANSFAC format. 
 12   
 13  Bio.motifs is replacing the older and now obsolete Bio.Motif module. 
 14  """ 
 15   
 16  from __future__ import print_function 
 17   
 18  from Bio._py3k import range 
19 20 21 -def create(instances, alphabet=None):
22 instances = Instances(instances, alphabet) 23 return Motif(instances=instances, alphabet=alphabet)
24
25 26 -def parse(handle, format):
27 """Parses an output file of motif finding programs. 28 29 Currently supported formats (case is ignored): 30 31 - AlignAce: AlignAce output file format 32 - MEME: MEME output file motif 33 - MAST: MAST output file motif 34 - TRANSFAC: TRANSFAC database file format 35 - pfm: JASPAR-style position-frequency matrix 36 - jaspar: JASPAR-style multiple PFM format 37 - sites: JASPAR-style sites file 38 39 As files in the pfm and sites formats contain only a single motif, 40 it is easier to use Bio.motifs.read() instead of Bio.motifs.parse() 41 for those. 42 43 For example: 44 45 >>> from Bio import motifs 46 >>> with open("Motif/alignace.out") as handle: 47 ... for m in motifs.parse(handle, "AlignAce"): 48 ... print(m.consensus) 49 ... 50 TCTACGATTGAG 51 CTGCAGCTAGCTACGAGTGAG 52 GTGCTCTAAGCATAGTAGGCG 53 GCCACTAGCAGAGCAGGGGGC 54 CGACTCAGAGGTT 55 CCACGCTAAGAGAGGTGCCGGAG 56 GCGCGTCGCTGAGCA 57 GTCCATCGCAAAGCGTGGGGC 58 GGGATCAGAGGGCCG 59 TGGAGGCGGGG 60 GACCAGAGCTTCGCATGGGGG 61 GGCGTGCGTG 62 GCTGGTTGCTGTTCATTAGG 63 GCCGGCGGCAGCTAAAAGGG 64 GAGGCCGGGGAT 65 CGACTCGTGCTTAGAAGG 66 """ 67 format = format.lower() 68 if format == "alignace": 69 from Bio.motifs import alignace 70 record = alignace.read(handle) 71 return record 72 elif format == "meme": 73 from Bio.motifs import meme 74 record = meme.read(handle) 75 return record 76 elif format == "mast": 77 from Bio.motifs import mast 78 record = mast.read(handle) 79 return record 80 elif format == "transfac": 81 from Bio.motifs import transfac 82 record = transfac.read(handle) 83 return record 84 elif format in ('pfm', 'sites', 'jaspar'): 85 from Bio.motifs import jaspar 86 record = jaspar.read(handle, format) 87 return record 88 else: 89 raise ValueError("Unknown format %s" % format)
90
91 92 -def read(handle, format):
93 """Reads a motif from a handle using a specified file-format. 94 95 This supports the same formats as Bio.motifs.parse(), but 96 only for files containing exactly one motif. For example, 97 reading a JASPAR-style pfm file: 98 99 >>> from Bio import motifs 100 >>> with open("motifs/SRF.pfm") as handle: 101 ... m = motifs.read(handle, "pfm") 102 >>> m.consensus 103 Seq('GCCCATATATGG', IUPACUnambiguousDNA()) 104 105 Or a single-motif MEME file, 106 107 >>> from Bio import motifs 108 >>> with open("motifs/meme.out") as handle: 109 ... m = motifs.read(handle, "meme") 110 >>> m.consensus 111 Seq('CTCAATCGTA', IUPACUnambiguousDNA()) 112 113 If the handle contains no records, or more than one record, 114 an exception is raised: 115 116 >>> from Bio import motifs 117 >>> with open("motifs/alignace.out") as handle: 118 ... motif = motifs.read(handle, "AlignAce") 119 Traceback (most recent call last): 120 ... 121 ValueError: More than one motif found in handle 122 123 If however you want the first motif from a file containing 124 multiple motifs this function would raise an exception (as 125 shown in the example above). Instead use: 126 127 >>> from Bio import motifs 128 >>> with open("motifs/alignace.out") as handle: 129 ... record = motifs.parse(handle, "alignace") 130 >>> motif = record[0] 131 >>> motif.consensus 132 Seq('TCTACGATTGAG', IUPACUnambiguousDNA()) 133 134 Use the Bio.motifs.parse(handle, format) function if you want 135 to read multiple records from the handle. 136 """ 137 format = format.lower() 138 motifs = parse(handle, format) 139 if len(motifs) == 0: 140 raise ValueError("No motifs found in handle") 141 if len(motifs) > 1: 142 raise ValueError("More than one motif found in handle") 143 motif = motifs[0] 144 return motif
145
146 147 -class Instances(list):
148 """ 149 A class representing instances of sequence motifs. 150 """
151 - def __init__(self, instances=None, alphabet=None):
152 from Bio.Alphabet import IUPAC 153 from Bio.Seq import Seq 154 if instances is None: 155 instances = [] 156 self.length = None 157 for instance in instances: 158 if self.length is None: 159 self.length = len(instance) 160 elif self.length != len(instance): 161 message = "All instances should have the same length (%d found, %d expected)" % (len(instance), self.length) 162 raise ValueError(message) 163 try: 164 a = instance.alphabet 165 except AttributeError: 166 # The instance is a plain string 167 continue 168 if alphabet is None: 169 alphabet = a 170 elif alphabet != a: 171 raise ValueError("Alphabets are inconsistent") 172 if alphabet is None or alphabet.letters is None: 173 # If we didn't get a meaningful alphabet from the instances, 174 # assume it is DNA. 175 alphabet = IUPAC.unambiguous_dna 176 for instance in instances: 177 if not isinstance(instance, Seq): 178 sequence = str(instance) 179 instance = Seq(sequence, alphabet=alphabet) 180 self.append(instance) 181 self.alphabet = alphabet
182
183 - def __str__(self):
184 text = "" 185 for instance in self: 186 text += str(instance) + "\n" 187 return text
188
189 - def count(self):
190 counts = {} 191 for letter in self.alphabet.letters: 192 counts[letter] = [0] * self.length 193 for instance in self: 194 for position, letter in enumerate(instance): 195 counts[letter][position] += 1 196 return counts
197
198 - def search(self, sequence):
199 """ 200 a generator function, returning found positions of motif instances in a given sequence 201 """ 202 for pos in range(0, len(sequence) - self.length + 1): 203 for instance in self: 204 if str(instance) == str(sequence[pos:pos + self.length]): 205 yield (pos, instance) 206 break # no other instance will fit (we don't want to return multiple hits)
207
208 - def reverse_complement(self):
209 instances = Instances(alphabet=self.alphabet) 210 instances.length = self.length 211 for instance in self: 212 instance = instance.reverse_complement() 213 instances.append(instance) 214 return instances
215
216 217 -class Motif(object):
218 """ 219 A class representing sequence motifs. 220 """
221 - def __init__(self, alphabet=None, instances=None, counts=None):
222 from . import matrix 223 from Bio.Alphabet import IUPAC 224 self.name = "" 225 if counts is not None and instances is not None: 226 raise Exception(ValueError, 227 "Specify either instances or counts, don't specify both") 228 elif counts is not None: 229 if alphabet is None: 230 alphabet = IUPAC.unambiguous_dna 231 self.instances = None 232 self.counts = matrix.FrequencyPositionMatrix(alphabet, counts) 233 self.length = self.counts.length 234 elif instances is not None: 235 self.instances = instances 236 alphabet = self.instances.alphabet 237 counts = self.instances.count() 238 self.counts = matrix.FrequencyPositionMatrix(alphabet, counts) 239 self.length = self.counts.length 240 else: 241 self.counts = None 242 self.instances = None 243 self.length = None 244 if alphabet is None: 245 alphabet = IUPAC.unambiguous_dna 246 self.alphabet = alphabet 247 self.pseudocounts = None 248 self.background = None 249 self.mask = None
250
251 - def __get_mask(self):
252 return self.__mask
253
254 - def __set_mask(self, mask):
255 if self.length is None: 256 self.__mask = () 257 elif mask is None: 258 self.__mask = (1,) * self.length 259 elif len(mask) != self.length: 260 raise ValueError("The length (%d) of the mask is inconsistent with the length (%d) of the motif", (len(mask), self.length)) 261 elif isinstance(mask, str): 262 self.__mask = [] 263 for char in mask: 264 if char == "*": 265 self.__mask.append(1) 266 elif char == " ": 267 self.__mask.append(0) 268 else: 269 raise ValueError("Mask should contain only '*' or ' ' and not a '%s'" % char) 270 self.__mask = tuple(self.__mask) 271 else: 272 self.__mask = tuple(int(bool(c)) for c in mask)
273 274 mask = property(__get_mask, __set_mask) 275 del __get_mask 276 del __set_mask 277
278 - def __get_pseudocounts(self):
279 return self._pseudocounts
280
281 - def __set_pseudocounts(self, value):
282 self._pseudocounts = {} 283 if isinstance(value, dict): 284 self._pseudocounts = dict((letter, value[letter]) for letter in self.alphabet.letters) 285 else: 286 if value is None: 287 value = 0.0 288 self._pseudocounts = dict.fromkeys(self.alphabet.letters, value)
289 290 pseudocounts = property(__get_pseudocounts, __set_pseudocounts) 291 del __get_pseudocounts 292 del __set_pseudocounts 293
294 - def __get_background(self):
295 return self._background
296
297 - def __set_background(self, value):
298 if isinstance(value, dict): 299 self._background = dict((letter, value[letter]) for letter in self.alphabet.letters) 300 elif value is None: 301 self._background = dict.fromkeys(self.alphabet.letters, 1.0) 302 else: 303 if sorted(self.alphabet.letters) != ["A", "C", "G", "T"]: 304 # TODO - Should this be a ValueError? 305 raise Exception("Setting the background to a single value only " 306 "works for DNA motifs (in which case the value " 307 "is interpreted as the GC content") 308 self._background['A'] = (1.0 - value) / 2.0 309 self._background['C'] = value / 2.0 310 self._background['G'] = value / 2.0 311 self._background['T'] = (1.0 - value) / 2.0 312 total = sum(self._background.values()) 313 for letter in self.alphabet.letters: 314 self._background[letter] /= total
315 316 background = property(__get_background, __set_background) 317 del __get_background 318 del __set_background 319 320 @property
321 - def pwm(self):
322 return self.counts.normalize(self._pseudocounts)
323 324 @property
325 - def pssm(self):
326 return self.pwm.log_odds(self._background)
327
328 - def __str__(self, masked=False):
329 """ string representation of a motif. 330 """ 331 text = "" 332 if self.instances is not None: 333 text += str(self.instances) 334 335 if masked: 336 for i in range(self.length): 337 if self.__mask[i]: 338 text += "*" 339 else: 340 text += " " 341 text += "\n" 342 return text
343
344 - def __len__(self):
345 """return the length of a motif 346 347 Please use this method (i.e. invoke len(m)) instead of referring to m.length directly. 348 """ 349 if self.length is None: 350 return 0 351 else: 352 return self.length
353
354 - def reverse_complement(self):
355 """Gives the reverse complement of the motif.""" 356 alphabet = self.alphabet 357 if self.instances is not None: 358 instances = self.instances.reverse_complement() 359 res = Motif(instances=instances, alphabet=alphabet) 360 else: # has counts 361 res = Motif(alphabet) 362 res.counts = {} 363 res.counts["A"] = self.counts["T"][::-1] 364 res.counts["T"] = self.counts["A"][::-1] 365 res.counts["G"] = self.counts["C"][::-1] 366 res.counts["C"] = self.counts["G"][::-1] 367 res.length = self.length 368 res.__mask = self.__mask[::-1] 369 return res
370 371 @property
372 - def consensus(self):
373 """Returns the consensus sequence.""" 374 return self.counts.consensus
375 376 @property
377 - def anticonsensus(self):
378 """Returns the least probable pattern to be generated from this motif.""" 379 return self.counts.anticonsensus
380 381 @property
382 - def degenerate_consensus(self):
383 """Generate degenerate consesnsus sequence. 384 385 Following the rules adapted from 386 D. R. Cavener: "Comparison of the consensus sequence flanking 387 translational start sites in Drosophila and vertebrates." 388 Nucleic Acids Research 15(4): 1353-1361. (1987). 389 390 The same rules are used by TRANSFAC. 391 """ 392 return self.counts.degenerate_consensus
393
394 - def weblogo(self, fname, format="PNG", version="2.8.2", **kwds):
395 """Uses the Berkeley weblogo service to download and save a weblogo of itself. 396 397 Requires an internet connection. 398 399 The parameters from ``**kwds`` are passed directly to the weblogo server. 400 401 Currently, this method uses WebLogo version 3.3. 402 These are the arguments and their default values passed to 403 WebLogo 3.3; see their website at http://weblogo.threeplusone.com 404 for more information:: 405 406 'stack_width' : 'medium', 407 'stack_per_line' : '40', 408 'alphabet' : 'alphabet_dna', 409 'ignore_lower_case' : True, 410 'unit_name' : "bits", 411 'first_index' : '1', 412 'logo_start' : '1', 413 'logo_end': str(self.length), 414 'composition' : "comp_auto", 415 'percentCG' : '', 416 'scale_width' : True, 417 'show_errorbars' : True, 418 'logo_title' : '', 419 'logo_label' : '', 420 'show_xaxis': True, 421 'xaxis_label': '', 422 'show_yaxis': True, 423 'yaxis_label': '', 424 'yaxis_scale': 'auto', 425 'yaxis_tic_interval' : '1.0', 426 'show_ends' : True, 427 'show_fineprint' : True, 428 'color_scheme': 'color_auto', 429 'symbols0': '', 430 'symbols1': '', 431 'symbols2': '', 432 'symbols3': '', 433 'symbols4': '', 434 'color0': '', 435 'color1': '', 436 'color2': '', 437 'color3': '', 438 'color4': '', 439 440 """ 441 from Bio._py3k import urlopen, urlencode, Request 442 from Bio import Alphabet 443 444 if isinstance(self.alphabet, Alphabet.ProteinAlphabet): 445 alpha = "alphabet_protein" 446 elif isinstance(self.alphabet, Alphabet.RNAAlphabet): 447 alpha = "alphabet_rna" 448 elif isinstance(self.alphabet, Alphabet.DNAAlphabet): 449 alpha = "alphabet_dna" 450 else: 451 alpha = "auto" 452 453 frequencies = self.format('transfac') 454 url = 'http://weblogo.threeplusone.com/create.cgi' 455 values = {'sequences': frequencies, 456 'format': format.lower(), 457 'stack_width': 'medium', 458 'stack_per_line': '40', 459 'alphabet': alpha, 460 'ignore_lower_case': True, 461 'unit_name': "bits", 462 'first_index': '1', 463 'logo_start': '1', 464 'logo_end': str(self.length), 465 'composition': "comp_auto", 466 'percentCG': '', 467 'scale_width': True, 468 'show_errorbars': True, 469 'logo_title': '', 470 'logo_label': '', 471 'show_xaxis': True, 472 'xaxis_label': '', 473 'show_yaxis': True, 474 'yaxis_label': '', 475 'yaxis_scale': 'auto', 476 'yaxis_tic_interval': '1.0', 477 'show_ends': True, 478 'show_fineprint': True, 479 'color_scheme': 'color_auto', 480 'symbols0': '', 481 'symbols1': '', 482 'symbols2': '', 483 'symbols3': '', 484 'symbols4': '', 485 'color0': '', 486 'color1': '', 487 'color2': '', 488 'color3': '', 489 'color4': '', 490 } 491 492 values.update( 493 dict((k, "" if v is False else str(v)) for k, v in kwds.items())) 494 data = urlencode(values).encode("utf-8") 495 req = Request(url, data) 496 response = urlopen(req) 497 with open(fname, "wb") as f: 498 im = response.read() 499 f.write(im)
500
501 - def format(self, format):
502 """Returns a string representation of the Motif in a given format 503 504 Currently supported fromats: 505 - pfm : JASPAR single Position Frequency Matrix 506 - jaspar : JASPAR multiple Position Frequency Matrix 507 - transfac : TRANSFAC like files 508 """ 509 if format in ('pfm', 'jaspar'): 510 from Bio.motifs import jaspar 511 motifs = [self] 512 return jaspar.write(motifs, format) 513 elif format == "transfac": 514 from Bio.motifs import transfac 515 motifs = [self] 516 return transfac.write(motifs) 517 else: 518 raise ValueError("Unknown format type %s" % format)
519
520 521 -def write(motifs, format):
522 """Returns a string representation of motifs in a given format 523 524 Currently supported formats (case is ignored): 525 - pfm : JASPAR simple single Position Frequency Matrix 526 - jaspar : JASPAR multiple PFM format 527 - transfac : TRANSFAC like files 528 """ 529 format = format.lower() 530 if format in ("pfm", "jaspar"): 531 from Bio.motifs import jaspar 532 return jaspar.write(motifs, format) 533 elif format == "transfac": 534 from Bio.motifs import transfac 535 return transfac.write(motifs) 536 else: 537 raise ValueError("Unknown format type %s" % format)
538 539 540 if __name__ == "__main__": 541 from Bio._utils import run_doctest 542 run_doctest(verbose=0) 543