Package Bio :: Package motifs
[hide private]
[frames] | no frames]

Source Code for Package Bio.motifs

  1  # Copyright 2003-2009 by Bartek Wilczynski.  All rights reserved. 
  2  # Copyright 2012-2013 by Michiel JL de Hoon.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Tools for sequence motif analysis. 
  7   
  8  Bio.motifs contains the core Motif class containing various I/O methods 
  9  as well as methods for motif comparisons and motif searching in sequences. 
 10  It also includes functionality for parsing output from the AlignACE, MEME, 
 11  and MAST programs, as well as files in the TRANSFAC format. 
 12   
 13  Bio.motifs is replacing the older and now obsolete Bio.Motif module. 
 14  """ 
 15   
 16  from __future__ import print_function 
 17   
 18  from Bio._py3k import range 
 19   
 20  import math 
21 22 23 -def create(instances, alphabet=None):
24 instances = Instances(instances, alphabet) 25 return Motif(instances=instances, alphabet=alphabet)
26
27 28 -def parse(handle, format):
29 """Parses an output file of motif finding programs. 30 31 Currently supported formats (case is ignored): 32 33 - AlignAce: AlignAce output file format 34 - MEME: MEME output file motif 35 - MAST: MAST output file motif 36 - TRANSFAC: TRANSFAC database file format 37 - pfm: JASPAR-style position-frequency matrix 38 - jaspar: JASPAR-style multiple PFM format 39 - sites: JASPAR-style sites file 40 41 As files in the pfm and sites formats contain only a single motif, 42 it is easier to use Bio.motifs.read() instead of Bio.motifs.parse() 43 for those. 44 45 For example: 46 47 >>> from Bio import motifs 48 >>> with open("Motif/alignace.out") as handle: 49 ... for m in motifs.parse(handle, "AlignAce"): 50 ... print(m.consensus) 51 ... 52 TCTACGATTGAG 53 CTGCAGCTAGCTACGAGTGAG 54 GTGCTCTAAGCATAGTAGGCG 55 GCCACTAGCAGAGCAGGGGGC 56 CGACTCAGAGGTT 57 CCACGCTAAGAGAGGTGCCGGAG 58 GCGCGTCGCTGAGCA 59 GTCCATCGCAAAGCGTGGGGC 60 GGGATCAGAGGGCCG 61 TGGAGGCGGGG 62 GACCAGAGCTTCGCATGGGGG 63 GGCGTGCGTG 64 GCTGGTTGCTGTTCATTAGG 65 GCCGGCGGCAGCTAAAAGGG 66 GAGGCCGGGGAT 67 CGACTCGTGCTTAGAAGG 68 """ 69 format = format.lower() 70 if format == "alignace": 71 from Bio.motifs import alignace 72 record = alignace.read(handle) 73 return record 74 elif format == "meme": 75 from Bio.motifs import meme 76 record = meme.read(handle) 77 return record 78 elif format == "mast": 79 from Bio.motifs import mast 80 record = mast.read(handle) 81 return record 82 elif format == "transfac": 83 from Bio.motifs import transfac 84 record = transfac.read(handle) 85 return record 86 elif format in ('pfm', 'sites', 'jaspar'): 87 from Bio.motifs import jaspar 88 record = jaspar.read(handle, format) 89 return record 90 else: 91 raise ValueError("Unknown format %s" % format)
92
93 94 -def read(handle, format):
95 """Reads a motif from a handle using a specified file-format. 96 97 This supports the same formats as Bio.motifs.parse(), but 98 only for files containing exactly one motif. For example, 99 reading a JASPAR-style pfm file: 100 101 >>> from Bio import motifs 102 >>> with open("motifs/SRF.pfm") as handle: 103 ... m = motifs.read(handle, "pfm") 104 >>> m.consensus 105 Seq('GCCCATATATGG', IUPACUnambiguousDNA()) 106 107 Or a single-motif MEME file, 108 109 >>> from Bio import motifs 110 >>> with open("motifs/meme.out") as handle: 111 ... m = motifs.read(handle, "meme") 112 >>> m.consensus 113 Seq('CTCAATCGTA', IUPACUnambiguousDNA()) 114 115 If the handle contains no records, or more than one record, 116 an exception is raised: 117 118 >>> from Bio import motifs 119 >>> with open("motifs/alignace.out") as handle: 120 ... motif = motifs.read(handle, "AlignAce") 121 Traceback (most recent call last): 122 ... 123 ValueError: More than one motif found in handle 124 125 If however you want the first motif from a file containing 126 multiple motifs this function would raise an exception (as 127 shown in the example above). Instead use: 128 129 >>> from Bio import motifs 130 >>> with open("motifs/alignace.out") as handle: 131 ... record = motifs.parse(handle, "alignace") 132 >>> motif = record[0] 133 >>> motif.consensus 134 Seq('TCTACGATTGAG', IUPACUnambiguousDNA()) 135 136 Use the Bio.motifs.parse(handle, format) function if you want 137 to read multiple records from the handle. 138 """ 139 format = format.lower() 140 motifs = parse(handle, format) 141 if len(motifs) == 0: 142 raise ValueError("No motifs found in handle") 143 if len(motifs) > 1: 144 raise ValueError("More than one motif found in handle") 145 motif = motifs[0] 146 return motif
147
148 149 -class Instances(list):
150 """ 151 A class representing instances of sequence motifs. 152 """
153 - def __init__(self, instances=None, alphabet=None):
154 from Bio.Alphabet import IUPAC 155 from Bio.Seq import Seq 156 if instances is None: 157 instances = [] 158 self.length = None 159 for instance in instances: 160 if self.length is None: 161 self.length = len(instance) 162 elif self.length != len(instance): 163 message = "All instances should have the same length (%d found, %d expected)" % (len(instance), self.length) 164 raise ValueError(message) 165 try: 166 a = instance.alphabet 167 except AttributeError: 168 # The instance is a plain string 169 continue 170 if alphabet is None: 171 alphabet = a 172 elif alphabet != a: 173 raise ValueError("Alphabets are inconsistent") 174 if alphabet is None or alphabet.letters is None: 175 # If we didn't get a meaningful alphabet from the instances, 176 # assume it is DNA. 177 alphabet = IUPAC.unambiguous_dna 178 for instance in instances: 179 if not isinstance(instance, Seq): 180 sequence = str(instance) 181 instance = Seq(sequence, alphabet=alphabet) 182 self.append(instance) 183 self.alphabet = alphabet
184
185 - def __str__(self):
186 text = "" 187 for instance in self: 188 text += str(instance) + "\n" 189 return text
190
191 - def count(self):
192 counts = {} 193 for letter in self.alphabet.letters: 194 counts[letter] = [0] * self.length 195 for instance in self: 196 for position, letter in enumerate(instance): 197 counts[letter][position] += 1 198 return counts
199
200 - def search(self, sequence):
201 """ 202 a generator function, returning found positions of motif instances in a given sequence 203 """ 204 for pos in range(0, len(sequence) - self.length + 1): 205 for instance in self: 206 if str(instance) == str(sequence[pos:pos + self.length]): 207 yield (pos, instance) 208 break # no other instance will fit (we don't want to return multiple hits)
209
210 - def reverse_complement(self):
211 instances = Instances(alphabet=self.alphabet) 212 instances.length = self.length 213 for instance in self: 214 instance = instance.reverse_complement() 215 instances.append(instance) 216 return instances
217
218 219 -class Motif(object):
220 """ 221 A class representing sequence motifs. 222 """
223 - def __init__(self, alphabet=None, instances=None, counts=None):
224 from . import matrix 225 from Bio.Alphabet import IUPAC 226 self.name = "" 227 if counts is not None and instances is not None: 228 raise Exception(ValueError, 229 "Specify either instances or counts, don't specify both") 230 elif counts is not None: 231 if alphabet is None: 232 alphabet = IUPAC.unambiguous_dna 233 self.instances = None 234 self.counts = matrix.FrequencyPositionMatrix(alphabet, counts) 235 self.length = self.counts.length 236 elif instances is not None: 237 self.instances = instances 238 alphabet = self.instances.alphabet 239 counts = self.instances.count() 240 self.counts = matrix.FrequencyPositionMatrix(alphabet, counts) 241 self.length = self.counts.length 242 else: 243 self.counts = None 244 self.instances = None 245 self.length = None 246 if alphabet is None: 247 alphabet = IUPAC.unambiguous_dna 248 self.alphabet = alphabet 249 self.pseudocounts = None 250 self.background = None 251 self.mask = None
252
253 - def __get_mask(self):
254 return self.__mask
255
256 - def __set_mask(self, mask):
257 if self.length is None: 258 self.__mask = () 259 elif mask is None: 260 self.__mask = (1,) * self.length 261 elif len(mask) != self.length: 262 raise ValueError("The length (%d) of the mask is inconsistent with the length (%d) of the motif", (len(mask), self.length)) 263 elif isinstance(mask, str): 264 self.__mask = [] 265 for char in mask: 266 if char == "*": 267 self.__mask.append(1) 268 elif char == " ": 269 self.__mask.append(0) 270 else: 271 raise ValueError("Mask should contain only '*' or ' ' and not a '%s'" % char) 272 self.__mask = tuple(self.__mask) 273 else: 274 self.__mask = tuple(int(bool(c)) for c in mask)
275 276 mask = property(__get_mask, __set_mask) 277 del __get_mask 278 del __set_mask 279
280 - def __get_pseudocounts(self):
281 return self._pseudocounts
282
283 - def __set_pseudocounts(self, value):
284 self._pseudocounts = {} 285 if isinstance(value, dict): 286 self._pseudocounts = dict((letter, value[letter]) for letter in self.alphabet.letters) 287 else: 288 if value is None: 289 value = 0.0 290 self._pseudocounts = dict.fromkeys(self.alphabet.letters, value)
291 292 pseudocounts = property(__get_pseudocounts, __set_pseudocounts) 293 del __get_pseudocounts 294 del __set_pseudocounts 295
296 - def __get_background(self):
297 return self._background
298
299 - def __set_background(self, value):
300 if isinstance(value, dict): 301 self._background = dict((letter, value[letter]) for letter in self.alphabet.letters) 302 elif value is None: 303 self._background = dict.fromkeys(self.alphabet.letters, 1.0) 304 else: 305 if sorted(self.alphabet.letters) != ["A", "C", "G", "T"]: 306 raise Exception("Setting the background to a single value only works for DNA motifs (in which case the value is interpreted as the GC content") 307 self._background['A'] = (1.0 - value) / 2.0 308 self._background['C'] = value / 2.0 309 self._background['G'] = value / 2.0 310 self._background['T'] = (1.0 - value) / 2.0 311 total = sum(self._background.values()) 312 for letter in self.alphabet.letters: 313 self._background[letter] /= total
314 315 background = property(__get_background, __set_background) 316 del __get_background 317 del __set_background 318 319 @property
320 - def pwm(self):
321 return self.counts.normalize(self._pseudocounts)
322 323 @property
324 - def pssm(self):
325 return self.pwm.log_odds(self._background)
326
327 - def __str__(self, masked=False):
328 """ string representation of a motif. 329 """ 330 text = "" 331 if self.instances is not None: 332 text += str(self.instances) 333 334 if masked: 335 for i in range(self.length): 336 if self.__mask[i]: 337 text += "*" 338 else: 339 text += " " 340 text += "\n" 341 return text
342
343 - def __len__(self):
344 """return the length of a motif 345 346 Please use this method (i.e. invoke len(m)) instead of referring to m.length directly. 347 """ 348 if self.length is None: 349 return 0 350 else: 351 return self.length
352
353 - def reverse_complement(self):
354 """Gives the reverse complement of the motif.""" 355 alphabet = self.alphabet 356 if self.instances is not None: 357 instances = self.instances.reverse_complement() 358 res = Motif(instances=instances, alphabet=alphabet) 359 else: # has counts 360 res = Motif(alphabet) 361 res.counts = {} 362 res.counts["A"] = self.counts["T"][::-1] 363 res.counts["T"] = self.counts["A"][::-1] 364 res.counts["G"] = self.counts["C"][::-1] 365 res.counts["C"] = self.counts["G"][::-1] 366 res.length = self.length 367 res.__mask = self.__mask[::-1] 368 return res
369 370 @property
371 - def consensus(self):
372 """Returns the consensus sequence.""" 373 return self.counts.consensus
374 375 @property
376 - def anticonsensus(self):
377 """Returns the least probable pattern to be generated from this motif.""" 378 return self.counts.anticonsensus
379 380 @property
381 - def degenerate_consensus(self):
382 """Generate degenerate consesnsus sequence. 383 384 Following the rules adapted from 385 D. R. Cavener: "Comparison of the consensus sequence flanking 386 translational start sites in Drosophila and vertebrates." 387 Nucleic Acids Research 15(4): 1353-1361. (1987). 388 389 The same rules are used by TRANSFAC. 390 """ 391 return self.counts.degenerate_consensus
392
393 - def weblogo(self, fname, format="PNG", version="2.8.2", **kwds):
394 """Uses the Berkeley weblogo service to download and save a weblogo of itself. 395 396 Requires an internet connection. 397 398 The parameters from ``**kwds`` are passed directly to the weblogo server. 399 400 Currently, this method uses WebLogo version 3.3. 401 These are the arguments and their default values passed to 402 WebLogo 3.3; see their website at http://weblogo.threeplusone.com 403 for more information:: 404 405 'stack_width' : 'medium', 406 'stack_per_line' : '40', 407 'alphabet' : 'alphabet_dna', 408 'ignore_lower_case' : True, 409 'unit_name' : "bits", 410 'first_index' : '1', 411 'logo_start' : '1', 412 'logo_end': str(self.length), 413 'composition' : "comp_auto", 414 'percentCG' : '', 415 'scale_width' : True, 416 'show_errorbars' : True, 417 'logo_title' : '', 418 'logo_label' : '', 419 'show_xaxis': True, 420 'xaxis_label': '', 421 'show_yaxis': True, 422 'yaxis_label': '', 423 'yaxis_scale': 'auto', 424 'yaxis_tic_interval' : '1.0', 425 'show_ends' : True, 426 'show_fineprint' : True, 427 'color_scheme': 'color_auto', 428 'symbols0': '', 429 'symbols1': '', 430 'symbols2': '', 431 'symbols3': '', 432 'symbols4': '', 433 'color0': '', 434 'color1': '', 435 'color2': '', 436 'color3': '', 437 'color4': '', 438 439 """ 440 from Bio._py3k import urlopen, urlencode, Request 441 from Bio import Alphabet 442 443 if isinstance(self.alphabet, Alphabet.ProteinAlphabet): 444 alpha = "alphabet_protein" 445 elif isinstance(self.alphabet, Alphabet.RNAAlphabet): 446 alpha = "alphabet_rna" 447 elif isinstance(self.alphabet, Alphabet.DNAAlphabet): 448 alpha = "alphabet_dna" 449 else: 450 alpha = "auto" 451 452 frequencies = self.format('transfac') 453 url = 'http://weblogo.threeplusone.com/create.cgi' 454 values = {'sequences': frequencies, 455 'format': format.lower(), 456 'stack_width': 'medium', 457 'stack_per_line': '40', 458 'alphabet': alpha, 459 'ignore_lower_case': True, 460 'unit_name': "bits", 461 'first_index': '1', 462 'logo_start': '1', 463 'logo_end': str(self.length), 464 'composition': "comp_auto", 465 'percentCG': '', 466 'scale_width': True, 467 'show_errorbars': True, 468 'logo_title': '', 469 'logo_label': '', 470 'show_xaxis': True, 471 'xaxis_label': '', 472 'show_yaxis': True, 473 'yaxis_label': '', 474 'yaxis_scale': 'auto', 475 'yaxis_tic_interval': '1.0', 476 'show_ends': True, 477 'show_fineprint': True, 478 'color_scheme': 'color_auto', 479 'symbols0': '', 480 'symbols1': '', 481 'symbols2': '', 482 'symbols3': '', 483 'symbols4': '', 484 'color0': '', 485 'color1': '', 486 'color2': '', 487 'color3': '', 488 'color4': '', 489 } 490 491 values.update( 492 dict((k, "" if v is False else str(v)) for k, v in kwds.items())) 493 data = urlencode(values).encode("utf-8") 494 req = Request(url, data) 495 response = urlopen(req) 496 with open(fname, "wb") as f: 497 im = response.read() 498 f.write(im)
499
500 - def format(self, format):
501 """Returns a string representation of the Motif in a given format 502 503 Currently supported fromats: 504 - pfm : JASPAR single Position Frequency Matrix 505 - jaspar : JASPAR multiple Position Frequency Matrix 506 - transfac : TRANSFAC like files 507 """ 508 509 if format in ('pfm', 'jaspar'): 510 from Bio.motifs import jaspar 511 motifs = [self] 512 return jaspar.write(motifs, format) 513 elif format == "transfac": 514 from Bio.motifs import transfac 515 motifs = [self] 516 return transfac.write(motifs) 517 else: 518 raise ValueError("Unknown format type %s" % format)
519
520 521 -def write(motifs, format):
522 """Returns a string representation of motifs in a given format 523 524 Currently supported formats (case is ignored): 525 - pfm : JASPAR simple single Position Frequency Matrix 526 - jaspar : JASPAR multiple PFM format 527 - transfac : TRANSFAC like files 528 """ 529 530 format = format.lower() 531 if format in ("pfm", "jaspar"): 532 from Bio.motifs import jaspar 533 return jaspar.write(motifs, format) 534 elif format == "transfac": 535 from Bio.motifs import transfac 536 return transfac.write(motifs) 537 else: 538 raise ValueError("Unknown format type %s" % format)
539 540 541 if __name__ == "__main__": 542 from Bio._utils import run_doctest 543 run_doctest(verbose=0) 544