Package Bio :: Package motifs :: Module meme
[hide private]
[frames] | no frames]

Source Code for Module Bio.motifs.meme

  1  # Copyright 2008 by Bartek Wilczynski 
  2  # Adapted from  Bio.MEME.Parser by Jason A. Hackney.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  from __future__ import print_function 
  8   
  9  from Bio.Alphabet import IUPAC 
 10  from Bio import Seq 
 11  from Bio import motifs 
 12   
 13   
def read(handle):
    """Parse the text output of the MEME program into a meme.Record object.

    ``handle`` is iterated line by line (and advanced with ``next``), so it
    must be an iterator over the lines of the MEME output, such as an open
    file.  The version, data file, alphabet, sequence names and command
    line are read first; after that every motif section is read and
    appended to the record as a meme.Motif, until the line starting with
    "SUMMARY OF MOTIFS" is reached.

    Raises ValueError if the stream ends before the summary of motifs is
    reached, if a line that should open a motif section does not start with
    'MOTIF', or if the motifs are not numbered consecutively from 1.

    Example:

    >>> from Bio.motifs import meme
    >>> with open("meme.output.txt") as f:
    ...     record = meme.read(f)
    >>> for motif in record:
    ...     for instance in motif.instances:
    ...         print(instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue)

    """
    record = Record()
    __read_version(record, handle)
    __read_datafile(record, handle)
    __read_alphabet(record, handle)
    __read_sequences(record, handle)
    __read_command(record, handle)
    # Skip ahead to the statistics line of the first motif.
    for line in handle:
        if line.startswith('MOTIF 1'):
            break
    else:
        raise ValueError('Unexpected end of stream')
    alphabet = record.alphabet
    # Instance lines carry an extra strand column when MEME was run with
    # 'revcomp' on its command line.
    revcomp = 'revcomp' in record.command
    while True:
        motif_number, length, num_occurrences, evalue = __read_motif_statistics(line)
        name = __read_motif_name(handle)
        instances = __read_motif_sequences(handle, name, alphabet, length, revcomp)
        motif = Motif(alphabet, instances)
        motif.length = length
        motif.num_occurrences = num_occurrences
        motif.evalue = evalue
        motif.name = name
        record.append(motif)
        # Validate with an explicit exception instead of an assert, which
        # would be stripped when Python runs with -O.
        if len(record) != motif_number:
            raise ValueError("Motif number %d does not match the number of motifs read so far (%d)"
                             % (motif_number, len(record)))
        __skip_unused_lines(handle)
        try:
            line = next(handle)
        except StopIteration:
            raise ValueError('Unexpected end of stream: Expected to find new motif, or the summary of motifs')
        if line.startswith("SUMMARY OF MOTIFS"):
            break
        if not line.startswith('MOTIF'):
            raise ValueError("Line does not start with 'MOTIF':\n%s" % line)
    return record
61 62
class Motif(motifs.Motif):
    """Motif subclass used when parsing MEME (and MAST) output.

    In addition to what the base motif class provides, it holds the data
    that is specific to MEME motifs: the motif name, the E-value of the
    motif, and its number of occurrences.
    """

    def __init__(self, alphabet=None, instances=None):
        """Initialize the base motif, then the MEME-specific attributes."""
        motifs.Motif.__init__(self, alphabet, instances)
        # Placeholder values; the parser overwrites them after construction.
        self.name = None
        self.num_occurrences = 0
        self.evalue = 0.0
75 76
class Instance(Seq.Seq):
    """One occurrence of a MEME motif, together with its associated data."""

    def __init__(self, *args, **kwds):
        """Build the underlying Seq, then set default instance data.

        All positional and keyword arguments are passed on unchanged to
        Seq.Seq.__init__.
        """
        Seq.Seq.__init__(self, *args, **kwds)
        # Defaults; the parser fills these in for every instance it reads.
        self.motif_name = ""
        self.sequence_name = ""
        self.strand = 0
        self.start = 0
        self.length = 0
        self.pvalue = 1.0
88 89
class Record(list):
    """Container for the results of a MEME run.

    A meme.Record is a list of motifs that also carries information about
    the run itself: the MEME version, the data file, the command line, the
    alphabet and the names of the input sequences.

    Because the class inherits from list, individual motifs can be
    accessed by index.  A motif can also be looked up by its name:

    >>> from Bio import motifs
    >>> with open("meme.output.txt") as f:
    ...     record = motifs.parse(f, 'MEME')
    >>> motif = record[0]
    >>> print(motif.name)
    Motif 1
    >>> motif = record['Motif 1']
    >>> print(motif.name)
    Motif 1
    """

    def __init__(self):
        """Create an empty record with blank run information."""
        self.sequences = []
        self.alphabet = None
        self.command = ""
        self.datafile = ""
        self.version = ""

    def __getitem__(self, key):
        """Return a motif by position, or by name when key is a string.

        Non-string keys (integers, slices) are handled by the list
        implementation.  For a string key the first motif whose ``name``
        equals the key is returned; None is returned if there is no such
        motif.
        """
        if not isinstance(key, str):
            return list.__getitem__(self, key)
        for candidate in self:
            if candidate.name == key:
                return candidate
        return None
126 127 128 # Everything below is private 129 130
def __read_version(record, handle):
    """Find the 'MEME version' line and store the version on the record.

    Lines are consumed from handle up to and including the first one that
    starts with 'MEME version'.  Raises ValueError if there is no such
    line.
    """
    line = next((text for text in handle if text.startswith('MEME version')), None)
    if line is None:
        raise ValueError("Improper input file. File should contain a line starting MEME version.")
    # The version is the third whitespace-separated word on the line.
    record.version = line.strip().split()[2]
140 141
def __read_datafile(record, handle):
    """Read the name of the data file from the 'TRAINING SET' section.

    Lines are skipped until one starts with 'TRAINING SET'.  The next line
    must start with '****' and the one after that with 'DATAFILE'; the
    DATAFILE line, stripped and with 'DATAFILE= ' removed, is stored as
    record.datafile.  Raises ValueError if the stream ends early or a line
    is not the expected one.
    """
    section_start = next((text for text in handle if text.startswith('TRAINING SET')), None)
    if section_start is None:
        raise ValueError("Unexpected end of stream: 'TRAINING SET' not found.")

    line = next(handle, None)
    if line is None:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '****'")
    if not line.startswith('****'):
        raise ValueError("Line does not start with '****':\n%s" % line)

    line = next(handle, None)
    if line is None:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'DATAFILE'")
    if not line.startswith('DATAFILE'):
        raise ValueError("Line does not start with 'DATAFILE':\n%s" % line)

    record.datafile = line.strip().replace('DATAFILE= ', '')
163 164
def __read_alphabet(record, handle):
    """Read the 'ALPHABET' line and store the matching alphabet on record.

    The next line of handle must start with 'ALPHABET'.  If the text left
    after stripping the line and removing 'ALPHABET= ' is exactly 'ACGT',
    IUPAC.unambiguous_dna is stored; any other value results in
    IUPAC.protein.  Raises ValueError if the stream has ended or the line
    does not start with 'ALPHABET'.
    """
    line = next(handle, None)
    if line is None:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'ALPHABET'")
    if not line.startswith('ALPHABET'):
        raise ValueError("Line does not start with 'ALPHABET':\n%s" % line)
    letters = line.strip().replace('ALPHABET= ', '')
    record.alphabet = IUPAC.unambiguous_dna if letters == 'ACGT' else IUPAC.protein
179 180
def __read_sequences(record, handle):
    """Read the table of sequence names and append them to record.sequences.

    The next two lines of handle must start with 'Sequence name' and
    '----'.  Every following line, up to the first one that starts with
    '***', contributes its first word; a line with exactly six words also
    contributes its fourth word (the table lists two sequences per row).
    Raises ValueError if the header lines are not as expected or the
    stream ends before the '***' line.
    """
    line = next(handle, None)
    if line is None:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'Sequence name'")
    if not line.startswith('Sequence name'):
        raise ValueError("Line does not start with 'Sequence name':\n%s" % line)

    line = next(handle, None)
    if line is None:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '----'")
    if not line.startswith('----'):
        raise ValueError("Line does not start with '----':\n%s" % line)

    for row in handle:
        if row.startswith('***'):
            return
        fields = row.strip().split()
        record.sequences.append(fields[0])
        if len(fields) == 6:
            record.sequences.append(fields[3])
    raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
204 205
def __read_command(record, handle):
    """Find the 'command:' line and store the command on the record.

    Lines are consumed from handle up to and including the first one that
    starts with 'command:'.  That line, stripped and with 'command: '
    removed, becomes record.command.  Raises ValueError if there is no
    such line.
    """
    line = next((text for text in handle if text.startswith('command:')), None)
    if line is None:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'command'")
    record.command = line.strip().replace('command: ', '')
215 216
def __read_motif_statistics(line):
    """Parse the statistics line that opens a motif section.

    Depending on the version of MEME, this line looks either like
        MOTIF  1        width =  19  sites =   3  llr = 43  E-value = 6.9e-002
    or like
        MOTIF  1 MEME   width =  19  sites =   3  llr = 43  E-value = 6.9e-002

    Returns the tuple (motif_number, length, num_occurrences, evalue),
    taken from the number after 'MOTIF' and from the 'width', 'sites' and
    'E-value' fields.

    Raises ValueError if the line does not have this layout.  Explicit
    exceptions are used rather than assert statements, because asserts
    are removed when Python runs with -O and malformed input would then
    go unnoticed.
    """
    words = line.split()
    if len(words) < 3 or words[0] != 'MOTIF':
        raise ValueError("Line is not a valid motif statistics line:\n%s" % line)
    motif_number = int(words[1])
    # Newer MEME versions insert the word 'MEME' after the motif number.
    key_values = words[3:] if words[2] == 'MEME' else words[2:]
    # The remainder is a sequence of "key = value" triples.
    keys = key_values[::3]
    equal_signs = key_values[1::3]
    values = key_values[2::3]
    if keys != ['width', 'sites', 'llr', 'E-value']:
        raise ValueError("Expected the fields 'width', 'sites', 'llr' and 'E-value' in line:\n%s" % line)
    if len(values) != len(keys) or any(sign != '=' for sign in equal_signs):
        raise ValueError("Expected every field to have the form 'key = value' in line:\n%s" % line)
    length = int(values[0])
    num_occurrences = int(values[1])
    evalue = float(values[3])
    return motif_number, length, num_occurrences, evalue
239 240
def __read_motif_name(handle):
    """Return the motif name from the 'sorted by position p-value' line.

    Lines are consumed from handle up to and including the first one that
    contains 'sorted by position p-value'.  The name is made of the first
    two words of that line joined by a single space.  Raises ValueError
    if there is no such line.
    """
    line = next((text for text in handle if 'sorted by position p-value' in text), None)
    if line is None:
        raise ValueError('Unexpected end of stream: Failed to find motif name')
    return " ".join(line.strip().split()[:2])
251 252
def __read_motif_sequences(handle, motif_name, alphabet, length, revcomp):
    """Read the table of sites of one motif and return them as instances.

    The next three lines of handle must start with '---', 'Sequence name'
    and '---'.  Each following line, up to the first one that starts with
    '---', describes one site.  Its words are the sequence name, the
    strand (present only when revcomp is true; otherwise '+' is used),
    the start, the p-value, and then three more columns of which the
    second is taken as the site sequence.

    An Instance is created for every site and the collection is returned
    as motifs.Instances(instances, alphabet).

    Raises ValueError if the header lines are not as expected, if the
    stream ends before the closing '---' line, if a site line has too few
    columns, or if a site sequence does not have the given length.
    Explicit exceptions are used rather than assert statements, because
    asserts are removed when Python runs with -O.
    """
    expected_header = (
        ('---', 'Unexpected end of stream: Failed to find motif sequences'),
        ('Sequence name', "Unexpected end of stream: Expected to find line starting with 'Sequence name'"),
        ('---', 'Unexpected end of stream: Failed to find motif sequences'),
    )
    for prefix, eof_message in expected_header:
        try:
            line = next(handle)
        except StopIteration:
            raise ValueError(eof_message)
        if not line.startswith(prefix):
            raise ValueError("Line does not start with '%s':\n%s" % (prefix, line))

    # The site sequence is the fifth column, after the optional strand
    # column has been removed.
    minimum_columns = 6 if revcomp else 5
    instances = []
    for line in handle:
        if line.startswith('---'):
            break
        words = line.split()
        if len(words) < minimum_columns:
            raise ValueError("Expected at least %d columns in motif site line:\n%s"
                             % (minimum_columns, line))
        strand = words.pop(1) if revcomp else '+'
        sequence = words[4]
        if len(sequence) != length:
            raise ValueError("Expected a site of length %d, but found %d in line:\n%s"
                             % (length, len(sequence), line))
        instance = Instance(sequence, alphabet)
        instance.motif_name = motif_name
        instance.sequence_name = words[0]
        instance.start = int(words[1])
        instance.pvalue = float(words[2])
        instance.strand = strand
        instance.length = length
        instances.append(instance)
    else:
        raise ValueError('Unexpected end of stream')
    return motifs.Instances(instances, alphabet)
295 296
def __skip_unused_lines(handle):
    """Consume the rest of a motif section, which the parser does not use.

    The stream is advanced, in order, past the first line starting with
    'log-odds matrix', the next line starting with '---', the first line
    starting with 'letter-probability matrix', the next line starting
    with '---', and the first line starting with 'Time'.  After that the
    following must come: a blank line, a line starting with '***', any
    number of blank lines, and another line starting with '***'.

    Raises ValueError if the stream ends early or a line is not the
    expected one.
    """
    markers = (
        ('log-odds matrix',
         "Unexpected end of stream: Expected to find line starting with 'log-odds matrix'"),
        ('---',
         "Unexpected end of stream: Expected to find line starting with '---'"),
        ('letter-probability matrix',
         "Unexpected end of stream: Expected to find line starting with 'letter-probability matrix'"),
        ('---',
         "Unexpected end of stream: Expected to find line starting with '---'"),
        ('Time',
         "Unexpected end of stream: Expected to find line starting with 'Time'"),
    )
    for prefix, eof_message in markers:
        for line in handle:
            if line.startswith(prefix):
                break
        else:
            raise ValueError(eof_message)

    line = next(handle, None)
    if line is None:
        raise ValueError('Unexpected end of stream: Expected to find blank line')
    if line.strip():
        raise ValueError("Expected blank line, but got:\n%s" % line)

    line = next(handle, None)
    if line is None:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
    if not line.startswith('***'):
        raise ValueError("Line does not start with '***':\n%s" % line)

    # Skip blank lines; the first non-blank line must be a '***' line again.
    line = next((text for text in handle if text.strip()), None)
    if line is None:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
    if not line.startswith('***'):
        raise ValueError("Line does not start with '***':\n%s" % line)
342