Package Bio :: Package motifs
[hide private]
[frames] | no frames]

Source Code for Package Bio.motifs

  1  # Copyright 2003-2009 by Bartek Wilczynski.  All rights reserved. 
  2  # Copyright 2012-2013 by Michiel JL de Hoon.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Tools for sequence motif analysis. 
  7   
  8  Bio.motifs contains the core Motif class containing various I/O methods 
  9  as well as methods for motif comparisons and motif searching in sequences. 
 10  It also includes functionality for parsing output from the AlignACE, MEME, 
 11  and MAST programs, as well as files in the TRANSFAC format. 
 12   
 13  Bio.motifs is replacing the older and now obsolete Bio.Motif module. 
 14  """ 
 15   
 16  from __future__ import print_function 
 17   
 18  from Bio._py3k import range 
 19   
 20  import math 
21 22 23 -def create(instances, alphabet=None):
24 instances = Instances(instances, alphabet) 25 return Motif(instances=instances, alphabet=alphabet)
26
27 28 -def parse(handle, format):
29 """Parses an output file of motif finding programs. 30 31 Currently supported formats (case is ignored): 32 - AlignAce: AlignAce output file format 33 - MEME: MEME output file motif 34 - MAST: MAST output file motif 35 - TRANSFAC: TRANSFAC database file format 36 - pfm: JASPAR-style position-frequency matrix 37 - jaspar: JASPAR-style multiple PFM format 38 - sites: JASPAR-style sites file 39 As files in the pfm and sites formats contain only a single motif, 40 it is easier to use Bio.motifs.read() instead of Bio.motifs.parse() 41 for those. 42 43 For example: 44 45 >>> from Bio import motifs 46 >>> for m in motifs.parse(open("Motif/alignace.out"), "AlignAce"): 47 ... print(m.consensus) 48 TCTACGATTGAG 49 CTGCAGCTAGCTACGAGTGAG 50 GTGCTCTAAGCATAGTAGGCG 51 GCCACTAGCAGAGCAGGGGGC 52 CGACTCAGAGGTT 53 CCACGCTAAGAGAGGTGCCGGAG 54 GCGCGTCGCTGAGCA 55 GTCCATCGCAAAGCGTGGGGC 56 GGGATCAGAGGGCCG 57 TGGAGGCGGGG 58 GACCAGAGCTTCGCATGGGGG 59 GGCGTGCGTG 60 GCTGGTTGCTGTTCATTAGG 61 GCCGGCGGCAGCTAAAAGGG 62 GAGGCCGGGGAT 63 CGACTCGTGCTTAGAAGG 64 """ 65 format = format.lower() 66 if format=="alignace": 67 from Bio.motifs import alignace 68 record = alignace.read(handle) 69 return record 70 elif format=="meme": 71 from Bio.motifs import meme 72 record = meme.read(handle) 73 return record 74 elif format=="mast": 75 from Bio.motifs import mast 76 record = mast.read(handle) 77 return record 78 elif format=="transfac": 79 from Bio.motifs import transfac 80 record = transfac.read(handle) 81 return record 82 elif format in ('pfm', 'sites', 'jaspar'): 83 from Bio.motifs import jaspar 84 record = jaspar.read(handle, format) 85 return record 86 else: 87 raise ValueError("Unknown format %s" % format)
88
89 90 -def read(handle, format):
91 """Reads a motif from a handle using a specified file-format. 92 93 This supports the same formats as Bio.motifs.parse(), but 94 only for files containing exactly one motif. For example, 95 reading a JASPAR-style pfm file: 96 97 >>> from Bio import motifs 98 >>> m = motifs.read(open("motifs/SRF.pfm"), "pfm") 99 >>> m.consensus 100 Seq('GCCCATATATGG', IUPACUnambiguousDNA()) 101 102 Or a single-motif MEME file, 103 104 >>> from Bio import motifs 105 >>> m = motifs.read(open("motifs/meme.out"), "meme") 106 >>> m.consensus 107 Seq('CTCAATCGTA', IUPACUnambiguousDNA()) 108 109 If the handle contains no records, or more than one record, 110 an exception is raised: 111 112 >>> from Bio import motifs 113 >>> motif = motifs.read(open("motifs/alignace.out"), "AlignAce") 114 Traceback (most recent call last): 115 ... 116 ValueError: More than one motif found in handle 117 118 If however you want the first motif from a file containing 119 multiple motifs this function would raise an exception (as 120 shown in the example above). Instead use: 121 122 >>> from Bio import motifs 123 >>> record = motifs.parse(open("motifs/alignace.out"), "alignace") 124 >>> motif = record[0] 125 >>> motif.consensus 126 Seq('TCTACGATTGAG', IUPACUnambiguousDNA()) 127 128 Use the Bio.motifs.parse(handle, format) function if you want 129 to read multiple records from the handle. 130 """ 131 format = format.lower() 132 motifs = parse(handle, format) 133 if len(motifs)==0: 134 raise ValueError("No motifs found in handle") 135 if len(motifs) > 1: 136 raise ValueError("More than one motif found in handle") 137 motif = motifs[0] 138 return motif
139
140 141 -class Instances(list):
142 """ 143 A class representing instances of sequence motifs. 144 """
145 - def __init__(self, instances=[], alphabet=None):
146 from Bio.Alphabet import IUPAC 147 from Bio.Seq import Seq 148 self.length = None 149 for instance in instances: 150 if self.length is None: 151 self.length = len(instance) 152 elif self.length != len(instance): 153 message = "All instances should have the same length (%d found, %d expected)" % (len(instance), self.length) 154 raise ValueError(message) 155 try: 156 a = instance.alphabet 157 except AttributeError: 158 # The instance is a plain string 159 continue 160 if alphabet is None: 161 alphabet = a 162 elif alphabet != a: 163 raise ValueError("Alphabets are inconsistent") 164 if alphabet is None or alphabet.letters is None: 165 # If we didn't get a meaningful alphabet from the instances, 166 # assume it is DNA. 167 alphabet = IUPAC.unambiguous_dna 168 for instance in instances: 169 if not isinstance(instance, Seq): 170 sequence = str(instance) 171 instance = Seq(sequence, alphabet=alphabet) 172 self.append(instance) 173 self.alphabet = alphabet
174
175 - def __str__(self):
176 text = "" 177 for instance in self: 178 text += str(instance) + "\n" 179 return text
180
181 - def count(self):
182 counts = {} 183 for letter in self.alphabet.letters: 184 counts[letter] = [0] * self.length 185 for instance in self: 186 for position, letter in enumerate(instance): 187 counts[letter][position] += 1 188 return counts
189
190 - def search(self, sequence):
191 """ 192 a generator function, returning found positions of motif instances in a given sequence 193 """ 194 for pos in range(0, len(sequence)-self.length+1): 195 for instance in self: 196 if str(instance) == str(sequence[pos:pos+self.length]): 197 yield(pos, instance) 198 break # no other instance will fit (we don't want to return multiple hits)
199 - def reverse_complement(self):
200 instances = Instances(alphabet=self.alphabet) 201 instances.length = self.length 202 for instance in self: 203 instance = instance.reverse_complement() 204 instances.append(instance) 205 return instances
206
207 208 -class Motif(object):
209 """ 210 A class representing sequence motifs. 211 """
212 - def __init__(self, alphabet=None, instances=None, counts=None):
213 from . import matrix 214 from Bio.Alphabet import IUPAC 215 self.name="" 216 if counts is not None and instances is not None: 217 raise Exception(ValueError, 218 "Specify either instances or counts, don't specify both") 219 elif counts is not None: 220 if alphabet is None: 221 alphabet = IUPAC.unambiguous_dna 222 self.instances = None 223 self.counts = matrix.FrequencyPositionMatrix(alphabet, counts) 224 self.length = self.counts.length 225 elif instances is not None: 226 self.instances = instances 227 alphabet = self.instances.alphabet 228 counts = self.instances.count() 229 self.counts = matrix.FrequencyPositionMatrix(alphabet, counts) 230 self.length = self.counts.length 231 else: 232 self.counts = None 233 self.instances = None 234 self.length = None 235 if alphabet is None: 236 alphabet = IUPAC.unambiguous_dna 237 self.alphabet = alphabet 238 self.pseudocounts = None 239 self.background = None 240 self.mask = None
241
242 - def __get_mask(self):
243 return self.__mask
244
245 - def __set_mask(self, mask):
246 if self.length is None: 247 self.__mask = () 248 elif mask is None: 249 self.__mask = (1,) * self.length 250 elif len(mask)!=self.length: 251 raise ValueError("The length (%d) of the mask is inconsistent with the length (%d) of the motif", (len(mask), self.length)) 252 elif isinstance(mask, str): 253 self.__mask=[] 254 for char in mask: 255 if char=="*": 256 self.__mask.append(1) 257 elif char==" ": 258 self.__mask.append(0) 259 else: 260 raise ValueError("Mask should contain only '*' or ' ' and not a '%s'"%char) 261 self.__mask = tuple(self.__mask) 262 else: 263 self.__mask = tuple(int(bool(c)) for c in mask)
264 265 mask = property(__get_mask, __set_mask) 266 del __get_mask 267 del __set_mask 268
269 - def __get_pseudocounts(self):
270 return self._pseudocounts
271
272 - def __set_pseudocounts(self, value):
273 self._pseudocounts = {} 274 if isinstance(value, dict): 275 self._pseudocounts = dict((letter, value[letter]) for letter in self.alphabet.letters) 276 else: 277 if value is None: 278 value = 0.0 279 self._pseudocounts = dict.fromkeys(self.alphabet.letters, value)
280 281 pseudocounts = property(__get_pseudocounts, __set_pseudocounts) 282 del __get_pseudocounts 283 del __set_pseudocounts 284
285 - def __get_background(self):
286 return self._background
287
288 - def __set_background(self, value):
289 if isinstance(value, dict): 290 self._background = dict((letter, value[letter]) for letter in self.alphabet.letters) 291 elif value is None: 292 self._background = dict.fromkeys(self.alphabet.letters, 1.0) 293 else: 294 if sorted(self.alphabet.letters)!=["A", "C", "G", "T"]: 295 raise Exception("Setting the background to a single value only works for DNA motifs (in which case the value is interpreted as the GC content") 296 self._background['A'] = (1.0-value)/2.0 297 self._background['C'] = value/2.0 298 self._background['G'] = value/2.0 299 self._background['T'] = (1.0-value)/2.0 300 total = sum(self._background.values()) 301 for letter in self.alphabet.letters: 302 self._background[letter] /= total
303 304 background = property(__get_background, __set_background) 305 del __get_background 306 del __set_background 307 308 @property
309 - def pwm(self):
310 return self.counts.normalize(self._pseudocounts)
311 312 @property
313 - def pssm(self):
314 return self.pwm.log_odds(self._background)
315
316 - def __str__(self,masked=False):
317 """ string representation of a motif. 318 """ 319 text = "" 320 if self.instances is not None: 321 text += str(self.instances) 322 323 if masked: 324 for i in range(self.length): 325 if self.__mask[i]: 326 text += "*" 327 else: 328 text += " " 329 text += "\n" 330 return text
331
332 - def __len__(self):
333 """return the length of a motif 334 335 Please use this method (i.e. invoke len(m)) instead of referring to m.length directly. 336 """ 337 if self.length is None: 338 return 0 339 else: 340 return self.length
341
342 - def reverse_complement(self):
343 """ 344 Gives the reverse complement of the motif 345 """ 346 alphabet = self.alphabet 347 if self.instances is not None: 348 instances = self.instances.reverse_complement() 349 res = Motif(instances=instances, alphabet=alphabet) 350 else: # has counts 351 res = Motif(alphabet) 352 res.counts={} 353 res.counts["A"]=self.counts["T"][::-1] 354 res.counts["T"]=self.counts["A"][::-1] 355 res.counts["G"]=self.counts["C"][::-1] 356 res.counts["C"]=self.counts["G"][::-1] 357 res.length=self.length 358 res.__mask = self.__mask[::-1] 359 return res
360 361 @property
362 - def consensus(self):
363 """Returns the consensus sequence. 364 """ 365 return self.counts.consensus
366 367 @property
368 - def anticonsensus(self):
369 """returns the least probable pattern to be generated from this motif. 370 """ 371 return self.counts.anticonsensus
372 373 @property
374 - def degenerate_consensus(self):
375 """Following the rules adapted from 376 D. R. Cavener: "Comparison of the consensus sequence flanking 377 translational start sites in Drosophila and vertebrates." 378 Nucleic Acids Research 15(4): 1353-1361. (1987). 379 The same rules are used by TRANSFAC.""" 380 return self.counts.degenerate_consensus
381
382 - def weblogo(self,fname,format="PNG",version="2.8.2", **kwds):
383 """ 384 uses the Berkeley weblogo service to download and save a weblogo of 385 itself 386 387 requires an internet connection. 388 The parameters from **kwds are passed directly to the weblogo server. 389 390 Currently, this method uses WebLogo version 3.3. 391 These are the arguments and their default values passed to 392 WebLogo 3.3; see their website at http://weblogo.threeplusone.com 393 for more information: 394 395 'stack_width' : 'medium', 396 'stack_per_line' : '40', 397 'alphabet' : 'alphabet_dna', 398 'ignore_lower_case' : True, 399 'unit_name' : "bits", 400 'first_index' : '1', 401 'logo_start' : '1', 402 'logo_end': str(self.length), 403 'composition' : "comp_auto", 404 'percentCG' : '', 405 'scale_width' : True, 406 'show_errorbars' : True, 407 'logo_title' : '', 408 'logo_label' : '', 409 'show_xaxis': True, 410 'xaxis_label': '', 411 'show_yaxis': True, 412 'yaxis_label': '', 413 'yaxis_scale': 'auto', 414 'yaxis_tic_interval' : '1.0', 415 'show_ends' : True, 416 'show_fineprint' : True, 417 'color_scheme': 'color_auto', 418 'symbols0': '', 419 'symbols1': '', 420 'symbols2': '', 421 'symbols3': '', 422 'symbols4': '', 423 'color0': '', 424 'color1': '', 425 'color2': '', 426 'color3': '', 427 'color4': '', 428 429 """ 430 from Bio._py3k import urlopen, urlencode, Request 431 432 frequencies = self.format('transfac') 433 url = 'http://weblogo.threeplusone.com/create.cgi' 434 values = {'sequences': frequencies, 435 'format': format.lower(), 436 'stack_width': 'medium', 437 'stack_per_line': '40', 438 'alphabet': 'alphabet_dna', 439 'ignore_lower_case': True, 440 'unit_name': "bits", 441 'first_index': '1', 442 'logo_start': '1', 443 'logo_end': str(self.length), 444 'composition': "comp_auto", 445 'percentCG': '', 446 'scale_width': True, 447 'show_errorbars': True, 448 'logo_title': '', 449 'logo_label': '', 450 'show_xaxis': True, 451 'xaxis_label': '', 452 'show_yaxis': True, 453 'yaxis_label': '', 454 'yaxis_scale': 'auto', 455 'yaxis_tic_interval': '1.0', 456 'show_ends': True, 457 'show_fineprint': True, 458 'color_scheme': 'color_auto', 459 'symbols0': '', 460 'symbols1': '', 461 'symbols2': '', 462 'symbols3': '', 463 'symbols4': '', 464 'color0': '', 465 'color1': '', 466 'color2': '', 467 'color3': '', 468 'color4': '', 469 } 470 for k, v in kwds.items(): 471 if isinstance(values[k], bool): 472 if not v: 473 v = "" 474 values[k]=str(v) 475 476 data = urlencode(values) 477 req = Request(url, data) 478 response = urlopen(req) 479 with open(fname,"w") as f: 480 im = response.read() 481 f.write(im)
482
483 - def format(self, format):
484 """Returns a string representation of the Motif in a given format 485 486 Currently supported fromats: 487 - pfm : JASPAR single Position Frequency Matrix 488 - jaspar : JASPAR multiple Position Frequency Matrix 489 - transfac : TRANSFAC like files 490 """ 491 492 if format in ('pfm', 'jaspar'): 493 from Bio.motifs import jaspar 494 motifs = [self] 495 return jaspar.write(motifs, format) 496 elif format=="transfac": 497 from Bio.motifs import transfac 498 motifs = [self] 499 return transfac.write(motifs) 500 else: 501 raise ValueError("Unknown format type %s" % format)
502
503 504 -def write(motifs, format):
505 """Returns a string representation of motifs in a given format 506 507 Currently supported formats (case is ignored): 508 - pfm : JASPAR simple single Position Frequency Matrix 509 - jaspar : JASPAR multiple PFM format 510 - transfac : TRANSFAC like files 511 """ 512 513 format = format.lower() 514 if format in ("pfm", "jaspar"): 515 from Bio.motifs import jaspar 516 return jaspar.write(motifs, format) 517 elif format=="transfac": 518 from Bio.motifs import transfac 519 return transfac.write(motifs) 520 else: 521 raise ValueError("Unknown format type %s" % format)
522