Package Bio :: Package motifs
[hide private]
[frames] | no frames]

Source Code for Package Bio.motifs

  1  # Copyright 2003-2009 by Bartek Wilczynski.  All rights reserved. 
  2  # Copyright 2012-2013 by Michiel JL de Hoon.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Tools for sequence motif analysis. 
  7   
  8  Bio.motifs contains the core Motif class containing various I/O methods 
  9  as well as methods for motif comparisons and motif searching in sequences. 
 10  It also includes functionality for parsing output from the AlignACE, MEME, 
 11  and MAST programs, as well as files in the TRANSFAC format. 
 12   
 13  Bio.motifs is replacing the older and now obsolete Bio.Motif module. 
 14  """ 
 15   
 16  from __future__ import print_function 
 17   
 18  from Bio._py3k import range 
19 20 21 -def create(instances, alphabet=None):
22 instances = Instances(instances, alphabet) 23 return Motif(instances=instances, alphabet=alphabet)
24
25 26 -def parse(handle, format):
27 """Parses an output file of motif finding programs. 28 29 Currently supported formats (case is ignored): 30 - AlignAce: AlignAce output file format 31 - MEME: MEME output file motif 32 - MAST: MAST output file motif 33 - TRANSFAC: TRANSFAC database file format 34 - pfm: JASPAR-style position-frequency matrix 35 - jaspar: JASPAR-style multiple PFM format 36 - sites: JASPAR-style sites file 37 38 As files in the pfm and sites formats contain only a single motif, 39 it is easier to use Bio.motifs.read() instead of Bio.motifs.parse() 40 for those. 41 42 For example: 43 44 >>> from Bio import motifs 45 >>> with open("Motif/alignace.out") as handle: 46 ... for m in motifs.parse(handle, "AlignAce"): 47 ... print(m.consensus) 48 ... 49 TCTACGATTGAG 50 CTGCAGCTAGCTACGAGTGAG 51 GTGCTCTAAGCATAGTAGGCG 52 GCCACTAGCAGAGCAGGGGGC 53 CGACTCAGAGGTT 54 CCACGCTAAGAGAGGTGCCGGAG 55 GCGCGTCGCTGAGCA 56 GTCCATCGCAAAGCGTGGGGC 57 GGGATCAGAGGGCCG 58 TGGAGGCGGGG 59 GACCAGAGCTTCGCATGGGGG 60 GGCGTGCGTG 61 GCTGGTTGCTGTTCATTAGG 62 GCCGGCGGCAGCTAAAAGGG 63 GAGGCCGGGGAT 64 CGACTCGTGCTTAGAAGG 65 """ 66 format = format.lower() 67 if format == "alignace": 68 from Bio.motifs import alignace 69 record = alignace.read(handle) 70 return record 71 elif format == "meme": 72 from Bio.motifs import meme 73 record = meme.read(handle) 74 return record 75 elif format == "mast": 76 from Bio.motifs import mast 77 record = mast.read(handle) 78 return record 79 elif format == "transfac": 80 from Bio.motifs import transfac 81 record = transfac.read(handle) 82 return record 83 elif format in ('pfm', 'sites', 'jaspar'): 84 from Bio.motifs import jaspar 85 record = jaspar.read(handle, format) 86 return record 87 else: 88 raise ValueError("Unknown format %s" % format)
89
90 91 -def read(handle, format):
92 """Reads a motif from a handle using a specified file-format. 93 94 This supports the same formats as Bio.motifs.parse(), but 95 only for files containing exactly one motif. For example, 96 reading a JASPAR-style pfm file: 97 98 >>> from Bio import motifs 99 >>> with open("motifs/SRF.pfm") as handle: 100 ... m = motifs.read(handle, "pfm") 101 >>> m.consensus 102 Seq('GCCCATATATGG', IUPACUnambiguousDNA()) 103 104 Or a single-motif MEME file, 105 106 >>> from Bio import motifs 107 >>> with open("motifs/meme.out") as handle: 108 ... m = motifs.read(handle, "meme") 109 >>> m.consensus 110 Seq('CTCAATCGTA', IUPACUnambiguousDNA()) 111 112 If the handle contains no records, or more than one record, 113 an exception is raised: 114 115 >>> from Bio import motifs 116 >>> with open("motifs/alignace.out") as handle: 117 ... motif = motifs.read(handle, "AlignAce") 118 Traceback (most recent call last): 119 ... 120 ValueError: More than one motif found in handle 121 122 If however you want the first motif from a file containing 123 multiple motifs this function would raise an exception (as 124 shown in the example above). Instead use: 125 126 >>> from Bio import motifs 127 >>> with open("motifs/alignace.out") as handle: 128 ... record = motifs.parse(handle, "alignace") 129 >>> motif = record[0] 130 >>> motif.consensus 131 Seq('TCTACGATTGAG', IUPACUnambiguousDNA()) 132 133 Use the Bio.motifs.parse(handle, format) function if you want 134 to read multiple records from the handle. 135 """ 136 format = format.lower() 137 motifs = parse(handle, format) 138 if len(motifs) == 0: 139 raise ValueError("No motifs found in handle") 140 if len(motifs) > 1: 141 raise ValueError("More than one motif found in handle") 142 motif = motifs[0] 143 return motif
144
145 146 -class Instances(list):
147 """A class representing instances of sequence motifs.""" 148
149 - def __init__(self, instances=None, alphabet=None):
150 from Bio.Alphabet import IUPAC 151 from Bio.Seq import Seq 152 if instances is None: 153 instances = [] 154 self.length = None 155 for instance in instances: 156 if self.length is None: 157 self.length = len(instance) 158 elif self.length != len(instance): 159 message = "All instances should have the same length (%d found, %d expected)" % (len(instance), self.length) 160 raise ValueError(message) 161 try: 162 a = instance.alphabet 163 except AttributeError: 164 # The instance is a plain string 165 continue 166 if alphabet is None: 167 alphabet = a 168 elif alphabet != a: 169 raise ValueError("Alphabets are inconsistent") 170 if alphabet is None or alphabet.letters is None: 171 # If we didn't get a meaningful alphabet from the instances, 172 # assume it is DNA. 173 alphabet = IUPAC.unambiguous_dna 174 for instance in instances: 175 if not isinstance(instance, Seq): 176 sequence = str(instance) 177 instance = Seq(sequence, alphabet=alphabet) 178 self.append(instance) 179 self.alphabet = alphabet
180
181 - def __str__(self):
182 text = "" 183 for instance in self: 184 text += str(instance) + "\n" 185 return text
186
187 - def count(self):
188 counts = {} 189 for letter in self.alphabet.letters: 190 counts[letter] = [0] * self.length 191 for instance in self: 192 for position, letter in enumerate(instance): 193 counts[letter][position] += 1 194 return counts
195
196 - def search(self, sequence):
197 """Find positions of motifs in a given sequence. 198 199 This is a generator function, returning found positions of motif 200 instances in a given sequence. 201 """ 202 for pos in range(0, len(sequence) - self.length + 1): 203 for instance in self: 204 if str(instance) == str(sequence[pos:pos + self.length]): 205 yield (pos, instance) 206 break # no other instance will fit (we don't want to return multiple hits)
207
208 - def reverse_complement(self):
209 instances = Instances(alphabet=self.alphabet) 210 instances.length = self.length 211 for instance in self: 212 instance = instance.reverse_complement() 213 instances.append(instance) 214 return instances
215
216 217 -class Motif(object):
218 """A class representing sequence motifs.""" 219
220 - def __init__(self, alphabet=None, instances=None, counts=None):
221 from . import matrix 222 from Bio.Alphabet import IUPAC 223 self.name = "" 224 if counts is not None and instances is not None: 225 raise Exception(ValueError, 226 "Specify either instances or counts, don't specify both") 227 elif counts is not None: 228 if alphabet is None: 229 alphabet = IUPAC.unambiguous_dna 230 self.instances = None 231 self.counts = matrix.FrequencyPositionMatrix(alphabet, counts) 232 self.length = self.counts.length 233 elif instances is not None: 234 self.instances = instances 235 alphabet = self.instances.alphabet 236 counts = self.instances.count() 237 self.counts = matrix.FrequencyPositionMatrix(alphabet, counts) 238 self.length = self.counts.length 239 else: 240 self.counts = None 241 self.instances = None 242 self.length = None 243 if alphabet is None: 244 alphabet = IUPAC.unambiguous_dna 245 self.alphabet = alphabet 246 self.pseudocounts = None 247 self.background = None 248 self.mask = None
249
250 - def __get_mask(self):
251 return self.__mask
252
253 - def __set_mask(self, mask):
254 if self.length is None: 255 self.__mask = () 256 elif mask is None: 257 self.__mask = (1,) * self.length 258 elif len(mask) != self.length: 259 raise ValueError("The length (%d) of the mask is inconsistent with the length (%d) of the motif", (len(mask), self.length)) 260 elif isinstance(mask, str): 261 self.__mask = [] 262 for char in mask: 263 if char == "*": 264 self.__mask.append(1) 265 elif char == " ": 266 self.__mask.append(0) 267 else: 268 raise ValueError("Mask should contain only '*' or ' ' and not a '%s'" % char) 269 self.__mask = tuple(self.__mask) 270 else: 271 self.__mask = tuple(int(bool(c)) for c in mask)
272 273 mask = property(__get_mask, __set_mask) 274 del __get_mask 275 del __set_mask 276
277 - def __get_pseudocounts(self):
278 return self._pseudocounts
279
280 - def __set_pseudocounts(self, value):
281 self._pseudocounts = {} 282 if isinstance(value, dict): 283 self._pseudocounts = dict((letter, value[letter]) for letter in self.alphabet.letters) 284 else: 285 if value is None: 286 value = 0.0 287 self._pseudocounts = dict.fromkeys(self.alphabet.letters, value)
288 289 pseudocounts = property(__get_pseudocounts, __set_pseudocounts) 290 del __get_pseudocounts 291 del __set_pseudocounts 292
293 - def __get_background(self):
294 return self._background
295
296 - def __set_background(self, value):
297 if isinstance(value, dict): 298 self._background = dict((letter, value[letter]) for letter in self.alphabet.letters) 299 elif value is None: 300 self._background = dict.fromkeys(self.alphabet.letters, 1.0) 301 else: 302 if sorted(self.alphabet.letters) != ["A", "C", "G", "T"]: 303 # TODO - Should this be a ValueError? 304 raise Exception("Setting the background to a single value only " 305 "works for DNA motifs (in which case the value " 306 "is interpreted as the GC content") 307 self._background['A'] = (1.0 - value) / 2.0 308 self._background['C'] = value / 2.0 309 self._background['G'] = value / 2.0 310 self._background['T'] = (1.0 - value) / 2.0 311 total = sum(self._background.values()) 312 for letter in self.alphabet.letters: 313 self._background[letter] /= total
314 315 background = property(__get_background, __set_background) 316 del __get_background 317 del __set_background 318 319 @property
320 - def pwm(self):
321 return self.counts.normalize(self._pseudocounts)
322 323 @property
324 - def pssm(self):
325 return self.pwm.log_odds(self._background)
326
327 - def __str__(self, masked=False):
328 """String representation of a motif.""" 329 text = "" 330 if self.instances is not None: 331 text += str(self.instances) 332 333 if masked: 334 for i in range(self.length): 335 if self.__mask[i]: 336 text += "*" 337 else: 338 text += " " 339 text += "\n" 340 return text
341
342 - def __len__(self):
343 """return the length of a motif 344 345 Please use this method (i.e. invoke len(m)) instead of referring to m.length directly. 346 """ 347 if self.length is None: 348 return 0 349 else: 350 return self.length
351
352 - def reverse_complement(self):
353 """Gives the reverse complement of the motif.""" 354 alphabet = self.alphabet 355 if self.instances is not None: 356 instances = self.instances.reverse_complement() 357 res = Motif(instances=instances, alphabet=alphabet) 358 else: # has counts 359 res = Motif(alphabet) 360 res.counts = {} 361 res.counts["A"] = self.counts["T"][::-1] 362 res.counts["T"] = self.counts["A"][::-1] 363 res.counts["G"] = self.counts["C"][::-1] 364 res.counts["C"] = self.counts["G"][::-1] 365 res.length = self.length 366 res.__mask = self.__mask[::-1] 367 return res
368 369 @property
370 - def consensus(self):
371 """Returns the consensus sequence.""" 372 return self.counts.consensus
373 374 @property
375 - def anticonsensus(self):
376 """Returns the least probable pattern to be generated from this motif.""" 377 return self.counts.anticonsensus
378 379 @property
380 - def degenerate_consensus(self):
381 """Generate degenerate consesnsus sequence. 382 383 Following the rules adapted from 384 D. R. Cavener: "Comparison of the consensus sequence flanking 385 translational start sites in Drosophila and vertebrates." 386 Nucleic Acids Research 15(4): 1353-1361. (1987). 387 388 The same rules are used by TRANSFAC. 389 """ 390 return self.counts.degenerate_consensus
391
392 - def weblogo(self, fname, format="PNG", version="2.8.2", **kwds):
393 """Uses the Berkeley weblogo service to download and save a weblogo of itself. 394 395 Requires an internet connection. 396 397 The parameters from ``**kwds`` are passed directly to the weblogo server. 398 399 Currently, this method uses WebLogo version 3.3. 400 These are the arguments and their default values passed to 401 WebLogo 3.3; see their website at http://weblogo.threeplusone.com 402 for more information:: 403 404 'stack_width' : 'medium', 405 'stack_per_line' : '40', 406 'alphabet' : 'alphabet_dna', 407 'ignore_lower_case' : True, 408 'unit_name' : "bits", 409 'first_index' : '1', 410 'logo_start' : '1', 411 'logo_end': str(self.length), 412 'composition' : "comp_auto", 413 'percentCG' : '', 414 'scale_width' : True, 415 'show_errorbars' : True, 416 'logo_title' : '', 417 'logo_label' : '', 418 'show_xaxis': True, 419 'xaxis_label': '', 420 'show_yaxis': True, 421 'yaxis_label': '', 422 'yaxis_scale': 'auto', 423 'yaxis_tic_interval' : '1.0', 424 'show_ends' : True, 425 'show_fineprint' : True, 426 'color_scheme': 'color_auto', 427 'symbols0': '', 428 'symbols1': '', 429 'symbols2': '', 430 'symbols3': '', 431 'symbols4': '', 432 'color0': '', 433 'color1': '', 434 'color2': '', 435 'color3': '', 436 'color4': '', 437 438 """ 439 from Bio._py3k import urlopen, urlencode, Request 440 from Bio import Alphabet 441 442 if isinstance(self.alphabet, Alphabet.ProteinAlphabet): 443 alpha = "alphabet_protein" 444 elif isinstance(self.alphabet, Alphabet.RNAAlphabet): 445 alpha = "alphabet_rna" 446 elif isinstance(self.alphabet, Alphabet.DNAAlphabet): 447 alpha = "alphabet_dna" 448 else: 449 alpha = "auto" 450 451 frequencies = self.format('transfac') 452 url = 'http://weblogo.threeplusone.com/create.cgi' 453 values = {'sequences': frequencies, 454 'format': format.lower(), 455 'stack_width': 'medium', 456 'stack_per_line': '40', 457 'alphabet': alpha, 458 'ignore_lower_case': True, 459 'unit_name': "bits", 460 'first_index': '1', 461 'logo_start': '1', 462 'logo_end': str(self.length), 463 'composition': "comp_auto", 464 'percentCG': '', 465 'scale_width': True, 466 'show_errorbars': True, 467 'logo_title': '', 468 'logo_label': '', 469 'show_xaxis': True, 470 'xaxis_label': '', 471 'show_yaxis': True, 472 'yaxis_label': '', 473 'yaxis_scale': 'auto', 474 'yaxis_tic_interval': '1.0', 475 'show_ends': True, 476 'show_fineprint': True, 477 'color_scheme': 'color_auto', 478 'symbols0': '', 479 'symbols1': '', 480 'symbols2': '', 481 'symbols3': '', 482 'symbols4': '', 483 'color0': '', 484 'color1': '', 485 'color2': '', 486 'color3': '', 487 'color4': '', 488 } 489 490 values.update( 491 dict((k, "" if v is False else str(v)) for k, v in kwds.items())) 492 data = urlencode(values).encode("utf-8") 493 req = Request(url, data) 494 response = urlopen(req) 495 with open(fname, "wb") as f: 496 im = response.read() 497 f.write(im)
498
499 - def format(self, format):
500 """Returns a string representation of the Motif in a given format 501 502 Currently supported fromats: 503 - pfm : JASPAR single Position Frequency Matrix 504 - jaspar : JASPAR multiple Position Frequency Matrix 505 - transfac : TRANSFAC like files 506 507 """ 508 if format in ('pfm', 'jaspar'): 509 from Bio.motifs import jaspar 510 motifs = [self] 511 return jaspar.write(motifs, format) 512 elif format == "transfac": 513 from Bio.motifs import transfac 514 motifs = [self] 515 return transfac.write(motifs) 516 else: 517 raise ValueError("Unknown format type %s" % format)
518
519 520 -def write(motifs, format):
521 """Returns a string representation of motifs in a given format 522 523 Currently supported formats (case is ignored): 524 - pfm : JASPAR simple single Position Frequency Matrix 525 - jaspar : JASPAR multiple PFM format 526 - transfac : TRANSFAC like files 527 528 """ 529 format = format.lower() 530 if format in ("pfm", "jaspar"): 531 from Bio.motifs import jaspar 532 return jaspar.write(motifs, format) 533 elif format == "transfac": 534 from Bio.motifs import transfac 535 return transfac.write(motifs) 536 else: 537 raise ValueError("Unknown format type %s" % format)
538 539 540 if __name__ == "__main__": 541 from Bio._utils import run_doctest 542 run_doctest(verbose=0) 543