Package Bio :: Package motifs :: Module meme
[hide private]
[frames] | no frames]

Source Code for Module Bio.motifs.meme

  1  # Copyright 2008 by Bartek Wilczynski 
  2  # Adapted from  Bio.MEME.Parser by Jason A. Hackney.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  from __future__ import print_function 
  8   
  9  from Bio.Alphabet import IUPAC 
 10  from Bio import Seq 
 11  from Bio import motifs 
 12   
 13   
14 -def read(handle):
15 """Parses the text output of the MEME program into a meme.Record object. 16 17 Example: 18 19 >>> from Bio.motifs import meme 20 >>> with open("meme.output.txt") as f: 21 ... record = meme.read(f) 22 >>> for motif in record: 23 ... for instance in motif.instances: 24 ... print(instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue) 25 26 """ 27 record = Record() 28 __read_version(record, handle) 29 __read_datafile(record, handle) 30 __read_alphabet(record, handle) 31 __read_sequences(record, handle) 32 __read_command(record, handle) 33 for line in handle: 34 if line.startswith('MOTIF 1'): 35 break 36 if record.version == '4.11.4' and line.startswith('MOTIF '): 37 break 38 else: 39 raise ValueError('Unexpected end of stream') 40 alphabet = record.alphabet 41 revcomp = 'revcomp' in record.command 42 while True: 43 motif_number, length, num_occurrences, evalue = __read_motif_statistics(line) 44 name = __read_motif_name(handle) 45 instances = __read_motif_sequences(handle, name, alphabet, length, revcomp) 46 motif = Motif(alphabet, instances) 47 motif.length = length 48 motif.num_occurrences = num_occurrences 49 motif.evalue = evalue 50 motif.name = name 51 record.append(motif) 52 assert len(record) == motif_number 53 __skip_unused_lines(handle) 54 try: 55 line = next(handle) 56 except StopIteration: 57 raise ValueError('Unexpected end of stream: Expected to find new motif, or the summary of motifs') 58 if line.startswith("SUMMARY OF MOTIFS"): 59 break 60 if not line.startswith('MOTIF'): 61 raise ValueError("Line does not start with 'MOTIF':\n%s" % line) 62 return record
63 64
65 -class Motif(motifs.Motif):
66 """A subclass of Motif used in parsing MEME (and MAST) output. 67 68 This subclass defines functions and data specific to MEME motifs. 69 This includes the motif name, the evalue for a motif, and its number 70 of occurrences. 71 """ 72
73 - def __init__(self, alphabet=None, instances=None):
74 motifs.Motif.__init__(self, alphabet, instances) 75 self.evalue = 0.0 76 self.num_occurrences = 0 77 self.name = None
78 79
80 -class Instance(Seq.Seq):
81 """A class describing the instances of a MEME motif, and the data thereof.""" 82
83 - def __init__(self, *args, **kwds):
84 Seq.Seq.__init__(self, *args, **kwds) 85 self.sequence_name = "" 86 self.start = 0 87 self.pvalue = 1.0 88 self.strand = 0 89 self.length = 0 90 self.motif_name = ""
91 92
93 -class Record(list):
94 """A class for holding the results of a MEME run. 95 96 A meme.Record is an object that holds the results from running 97 MEME. It implements no methods of its own. 98 99 The meme.Record class inherits from list, so you can access individual 100 motifs in the record by their index. Alternatively, you can find a motif 101 by its name: 102 103 >>> from Bio import motifs 104 >>> with open("meme.output.txt") as f: 105 ... record = motifs.parse(f, 'MEME') 106 >>> motif = record[0] 107 >>> print(motif.name) 108 Motif 1 109 >>> motif = record['Motif 1'] 110 >>> print(motif.name) 111 Motif 1 112 """ 113
114 - def __init__(self):
115 """__init__ (self)""" 116 self.version = "" 117 self.datafile = "" 118 self.command = "" 119 self.alphabet = None 120 self.sequences = []
121
122 - def __getitem__(self, key):
123 if isinstance(key, str): 124 for motif in self: 125 if motif.name == key: 126 return motif 127 else: 128 return list.__getitem__(self, key)
129 130 131 # Everything below is private 132 133
134 -def __read_version(record, handle):
135 for line in handle: 136 if line.startswith('MEME version'): 137 break 138 else: 139 raise ValueError("Improper input file. File should contain a line starting MEME version.") 140 line = line.strip() 141 ls = line.split() 142 record.version = ls[2]
143 144
145 -def __read_datafile(record, handle):
146 for line in handle: 147 if line.startswith('TRAINING SET'): 148 break 149 else: 150 raise ValueError( 151 "Unexpected end of stream: 'TRAINING SET' not found. This can happen with " + 152 "minimal MEME files (MEME databases) which are not supported yet.") 153 try: 154 line = next(handle) 155 except StopIteration: 156 raise ValueError("Unexpected end of stream: Expected to find line starting with '****'") 157 if not line.startswith('****'): 158 raise ValueError("Line does not start with '****':\n%s" % line) 159 try: 160 line = next(handle) 161 except StopIteration: 162 raise ValueError("Unexpected end of stream: Expected to find line starting with 'DATAFILE'") 163 if not line.startswith('DATAFILE'): 164 raise ValueError("Line does not start with 'DATAFILE':\n%s" % line) 165 line = line.strip() 166 line = line.replace('DATAFILE= ', '') 167 record.datafile = line
168 169
170 -def __read_alphabet(record, handle):
171 try: 172 line = next(handle) 173 except StopIteration: 174 raise ValueError("Unexpected end of stream: Expected to find line starting with 'ALPHABET'") 175 if not line.startswith('ALPHABET'): 176 raise ValueError("Line does not start with 'ALPHABET':\n%s" % line) 177 line = line.strip() 178 line = line.replace('ALPHABET= ', '') 179 if line == 'ACGT': 180 al = IUPAC.unambiguous_dna 181 else: 182 al = IUPAC.protein 183 record.alphabet = al
184 185
186 -def __read_sequences(record, handle):
187 try: 188 line = next(handle) 189 except StopIteration: 190 raise ValueError("Unexpected end of stream: Expected to find line starting with 'Sequence name'") 191 if not line.startswith('Sequence name'): 192 raise ValueError("Line does not start with 'Sequence name':\n%s" % line) 193 try: 194 line = next(handle) 195 except StopIteration: 196 raise ValueError("Unexpected end of stream: Expected to find line starting with '----'") 197 if not line.startswith('----'): 198 raise ValueError("Line does not start with '----':\n%s" % line) 199 for line in handle: 200 if line.startswith('***'): 201 break 202 line = line.strip() 203 ls = line.split() 204 record.sequences.append(ls[0]) 205 if len(ls) == 6: 206 record.sequences.append(ls[3]) 207 else: 208 raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
209 210
211 -def __read_command(record, handle):
212 for line in handle: 213 if line.startswith('command:'): 214 break 215 else: 216 raise ValueError("Unexpected end of stream: Expected to find line starting with 'command'") 217 line = line.strip() 218 line = line.replace('command: ', '') 219 record.command = line
220 221
222 -def __read_motif_statistics(line):
223 # Depending on the version of MEME, this line either like like 224 # MOTIF 1 width = 19 sites = 3 llr = 43 E-value = 6.9e-002 225 # or like 226 # MOTIF 1 MEME width = 19 sites = 3 llr = 43 E-value = 6.9e-002 227 # or in v 4.11.4 228 # MOTIF ATTATAAAAAAA MEME-1 width = 12 sites = 5 llr = 43 E-value = 1.9e-003 229 words = line.split() 230 assert words[0] == 'MOTIF' 231 if words[2][:5] == 'MEME-': 232 motif_number = int(words[2].split('-')[1]) 233 else: 234 motif_number = int(words[1]) 235 if words[2].startswith('MEME'): 236 key_values = words[3:] 237 else: 238 key_values = words[2:] 239 keys = key_values[::3] 240 equal_signs = key_values[1::3] 241 values = key_values[2::3] 242 assert keys == ['width', 'sites', 'llr', 'E-value'] 243 for equal_sign in equal_signs: 244 assert equal_sign == '=' 245 length = int(values[0]) 246 num_occurrences = int(values[1]) 247 evalue = float(values[3]) 248 return motif_number, length, num_occurrences, evalue
249 250
251 -def __read_motif_name(handle):
252 for line in handle: 253 if 'sorted by position p-value' in line: 254 break 255 else: 256 raise ValueError('Unexpected end of stream: Failed to find motif name') 257 line = line.strip() 258 words = line.split() 259 name = " ".join(words[0:2]) 260 return name
261 262
263 -def __read_motif_sequences(handle, motif_name, alphabet, length, revcomp):
264 try: 265 line = next(handle) 266 except StopIteration: 267 raise ValueError('Unexpected end of stream: Failed to find motif sequences') 268 if not line.startswith('---'): 269 raise ValueError("Line does not start with '---':\n%s" % line) 270 try: 271 line = next(handle) 272 except StopIteration: 273 raise ValueError("Unexpected end of stream: Expected to find line starting with 'Sequence name'") 274 if not line.startswith('Sequence name'): 275 raise ValueError("Line does not start with 'Sequence name':\n%s" % line) 276 try: 277 line = next(handle) 278 except StopIteration: 279 raise ValueError('Unexpected end of stream: Failed to find motif sequences') 280 if not line.startswith('---'): 281 raise ValueError("Line does not start with '---':\n%s" % line) 282 instances = [] 283 for line in handle: 284 if line.startswith('---'): 285 break 286 line = line.strip() 287 words = line.split() 288 if revcomp: 289 strand = words.pop(1) 290 else: 291 strand = '+' 292 sequence = words[4] 293 assert len(sequence) == length 294 instance = Instance(sequence, alphabet) 295 instance.motif_name = motif_name 296 instance.sequence_name = words[0] 297 instance.start = int(words[1]) 298 instance.pvalue = float(words[2]) 299 instance.strand = strand 300 instance.length = length 301 instances.append(instance) 302 else: 303 raise ValueError('Unexpected end of stream') 304 return motifs.Instances(instances, alphabet)
305 306
307 -def __skip_unused_lines(handle):
308 for line in handle: 309 if line.startswith('log-odds matrix'): 310 break 311 else: 312 raise ValueError("Unexpected end of stream: Expected to find line starting with 'log-odds matrix'") 313 for line in handle: 314 if line.startswith('---'): 315 break 316 else: 317 raise ValueError("Unexpected end of stream: Expected to find line starting with '---'") 318 for line in handle: 319 if line.startswith('letter-probability matrix'): 320 break 321 else: 322 raise ValueError("Unexpected end of stream: Expected to find line starting with 'letter-probability matrix'") 323 for line in handle: 324 if line.startswith('---'): 325 break 326 else: 327 raise ValueError("Unexpected end of stream: Expected to find line starting with '---'") 328 for line in handle: 329 if line.startswith('Time'): 330 break 331 else: 332 raise ValueError("Unexpected end of stream: Expected to find line starting with 'Time'") 333 try: 334 line = next(handle) 335 except StopIteration: 336 raise ValueError('Unexpected end of stream: Expected to find blank line') 337 if line.strip(): 338 raise ValueError("Expected blank line, but got:\n%s" % line) 339 try: 340 line = next(handle) 341 except StopIteration: 342 raise ValueError("Unexpected end of stream: Expected to find line starting with '***'") 343 if not line.startswith('***'): 344 raise ValueError("Line does not start with '***':\n%s" % line) 345 for line in handle: 346 if line.strip(): 347 break 348 else: 349 raise ValueError("Unexpected end of stream: Expected to find line starting with '***'") 350 if not line.startswith('***'): 351 raise ValueError("Line does not start with '***':\n%s" % line)
352