Package Bio :: Package Motif :: Package Parsers :: Module MEME
[hide private]
[frames] | no frames]

Source Code for Module Bio.Motif.Parsers.MEME

  1  # Copyright 2008 by Bartek Wilczynski 
  2  # Adapted from  Bio.MEME.Parser by Jason A. Hackney.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  from __future__ import print_function 
  8   
  9  from Bio.Alphabet import IUPAC 
 10  from Bio import Seq 
 11  import re 
 12  from math import sqrt 
 13  import sys 
 14  from Bio.Motif import Motif 
 15   
 16   
 17   
def read(handle):
    """Parse the text output of the MEME program into a MEMERecord object.

    The handle is consumed sequentially: first the run header (version,
    data file, alphabet, sequence names, command line), then one section
    per motif until the "SUMMARY OF MOTIFS" line is reached.

    Example:

    >>> from Bio.Motif.Parsers import MEME
    >>> with open("meme.output.txt") as f:
    ...     record = MEME.read(f)
    >>> for motif in record.motifs:
    ...     for instance in motif.instances:
    ...         print(instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue)

    Raises ValueError if the stream ends early or an expected line is
    missing.
    """
    record = MEMERecord()
    __read_version(record, handle)
    __read_datafile(record, handle)
    __read_alphabet(record, handle)
    __read_sequence_names(record, handle)
    __read_command(record, handle)

    # Skip ahead to the header line of the first motif.
    # NOTE(review): the spacing inside this literal may have been collapsed
    # when the listing was extracted -- confirm against real MEME output.
    header = next((row for row in handle if row.startswith('MOTIF 1')), None)
    if header is None:
        raise ValueError('Unexpected end of stream')

    # `header` always holds the "MOTIF ..." line of the motif being parsed;
    # it becomes None once the summary section is reached.
    while header is not None:
        motif = __create_motif(header)
        motif.alphabet = record.alphabet
        record.motifs.append(motif)
        __read_motif_name(motif, handle)
        # Instance lines carry an extra strand column when MEME was run
        # with the revcomp option.
        __read_motif_sequences(motif, handle, 'revcomp' in record.command)
        __skip_unused_lines(handle)
        try:
            following = next(handle)
        except StopIteration:
            raise ValueError('Unexpected end of stream: Expected to find new motif, or the summary of motifs')
        if following.startswith("SUMMARY OF MOTIFS"):
            header = None
        elif following.startswith('MOTIF'):
            header = following
        else:
            raise ValueError("Line does not start with 'MOTIF':\n%s" % following)
    return record
58 59
class MEMEMotif(Motif):
    """A subclass of Motif used in parsing MEME (and MAST) output.

    On top of what Motif provides, a MEMEMotif stores the E-value reported
    for the motif (``evalue``) and, once set, the number of occurrences
    (``num_occurrences``).

    Methods:
    add_instance_from_values (name = 'default', pvalue = 1, sequence = 'ATA', start = 0, strand = '+'): create a new MEMEInstance with the specified values and add it to the motif.
    get_instance_by_name (name): return the first instance whose sequence_name equals name, or None.
    """

    def __init__(self):
        Motif.__init__(self)
        self.evalue = 0.0

    def _numoccurrences(self, number):
        """Store the number of occurrences, converting to int if needed."""
        self.num_occurrences = number if isinstance(number, int) else int(number)

    def get_instance_by_name(self, name):
        """Return the first instance found in sequence `name`, else None."""
        hits = (inst for inst in self.instances if inst.sequence_name == name)
        return next(hits, None)

    def add_instance_from_values(self, name='default', pvalue=1, sequence='ATA', start=0, strand='+'):
        """Build a MEMEInstance from raw values and add it to this motif.

        The instance length is taken from the motif when the motif has a
        (truthy) length, otherwise from the sequence itself.  The motif
        name is copied onto the instance only when the motif has one.
        """
        instance = MEMEInstance(sequence, self.alphabet)
        instance._pvalue(pvalue)
        instance._seqname(name)
        instance._start(start)
        instance._strand(strand)
        instance._length(self.length or len(sequence))
        if self.name:
            instance._motifname(self.name)
        self.add_instance(instance)

    def _evalue(self, evalue):
        """Store the motif E-value, converting to float if needed."""
        self.evalue = evalue if isinstance(evalue, float) else float(evalue)
109 110
class MEMEInstance(Seq.Seq):
    """One occurrence (site) of a MEME motif, with its associated data.

    The object is a Seq holding the site sequence, extended with the
    attributes initialised in __init__.  Note that ``score`` is not
    initialised there; it only exists after _score() has been called.
    """

    def __init__(self, *args, **kwds):
        Seq.Seq.__init__(self, *args, **kwds)
        # Defaults; the parser overwrites them through the setters below.
        self.motif_name = ""
        self.sequence_name = ""
        self.strand = 0
        self.start = 0
        self.length = 0
        self.pvalue = 1.0

    def _seqname(self, name):
        """Set the name of the sequence this instance was found in."""
        self.sequence_name = name

    def _motifname(self, name):
        """Set the name of the motif this instance belongs to."""
        self.motif_name = name

    def _start(self, start):
        """Set the start position, converted to int."""
        self.start = int(start)

    def _pvalue(self, pval):
        """Set the p-value, converted to float."""
        self.pvalue = float(pval)

    def _score(self, score):
        """Set the score, converted to float."""
        self.score = float(score)

    def _strand(self, strand):
        """Set the strand; the value is stored as given."""
        self.strand = strand

    def _length(self, length):
        """Set the instance length; the value is stored as given."""
        self.length = length
147 148
class MEMERecord(object):
    """A class for holding the results of a MEME run.

    Attributes:
    motifs - list of the motifs found, in the order they were parsed
    version - MEME version string
    datafile - name of the training set data file
    command - command line used to run MEME
    alphabet - alphabet of the training set (None until parsed)
    sequence_names - names of the training set sequences

    Methods:
    get_motif_by_name (name): return the first motif with that name, or None.
    """

    def __init__(self):
        """Create an empty record; the parser fills in the attributes."""
        self.motifs = []
        self.version = ""
        self.datafile = ""
        self.command = ""
        self.alphabet = None
        self.sequence_names = []

    def get_motif_by_name(self, name):
        """Return the first motif whose name equals `name`, or None."""
        for motif in self.motifs:
            if motif.name == name:
                return motif
        # Explicit, to mirror MEMEMotif.get_instance_by_name.
        return None
# Everything below is private
def __read_version(record, handle):
    """Find the 'MEME version' line and store the version on the record.

    Lines are consumed from handle up to and including the first one that
    starts with 'MEME version'; the third whitespace-separated token of
    that line is stored in record.version.

    Raises ValueError if no such line exists, or if the line carries no
    version token (previously this surfaced as a bare IndexError).
    """
    for line in handle:
        if line.startswith('MEME version'):
            break
    else:
        raise ValueError("Improper input file. File should contain a line starting MEME version.")
    fields = line.split()
    if len(fields) < 3:
        raise ValueError("Line starting with 'MEME version' does not contain a version number:\n%s" % line)
    record.version = fields[2]
183 184
def __read_datafile(record, handle):
    """Read the TRAINING SET header and store the data file name.

    Skips to the line starting with 'TRAINING SET', then requires the
    next two lines to start with '****' and 'DATAFILE' respectively.  The
    DATAFILE line, stripped and with the 'DATAFILE= ' label removed, is
    stored in record.datafile.

    Raises ValueError if the stream ends early or a line does not match.
    """
    banner = next((row for row in handle if row.startswith('TRAINING SET')), None)
    if banner is None:
        raise ValueError("Unexpected end of stream: 'TRAINING SET' not found.")

    rule = next(handle, None)
    if rule is None:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '****'")
    if not rule.startswith('****'):
        raise ValueError("Line does not start with '****':\n%s" % rule)

    entry = next(handle, None)
    if entry is None:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'DATAFILE'")
    if not entry.startswith('DATAFILE'):
        raise ValueError("Line does not start with 'DATAFILE':\n%s" % entry)

    record.datafile = entry.strip().replace('DATAFILE= ', '')
206 207
def __read_alphabet(record, handle):
    """Read the ALPHABET line and store the matching alphabet on the record.

    The next line of handle must start with 'ALPHABET'.  If the remainder
    (after removing the 'ALPHABET= ' label) is exactly 'ACGT' the record
    gets IUPAC.unambiguous_dna; any other value is treated as protein.

    Raises ValueError if the stream ends or the line does not match.
    """
    row = next(handle, None)
    if row is None:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'ALPHABET'")
    if not row.startswith('ALPHABET'):
        raise ValueError("Line does not start with 'ALPHABET':\n%s" % row)

    letters = row.strip().replace('ALPHABET= ', '')
    record.alphabet = IUPAC.unambiguous_dna if letters == 'ACGT' else IUPAC.protein
222 223
def __read_sequence_names(record, handle):
    """Read the table of training-set sequences into record.sequence_names.

    The next two lines of handle must start with 'Sequence name' and
    '----'.  Every following line up to (and consuming) the first line
    starting with '***' contributes its first token as a sequence name;
    lines with exactly six tokens describe two sequences and also
    contribute their fourth token.

    Raises ValueError if the stream ends early or a header line does not
    match.
    """
    heading = next(handle, None)
    if heading is None:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'Sequence name'")
    if not heading.startswith('Sequence name'):
        raise ValueError("Line does not start with 'Sequence name':\n%s" % heading)

    rule = next(handle, None)
    if rule is None:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '----'")
    if not rule.startswith('----'):
        raise ValueError("Line does not start with '----':\n%s" % rule)

    terminated = False
    for row in handle:
        if row.startswith('***'):
            terminated = True
            break
        columns = row.split()
        record.sequence_names.append(columns[0])
        if len(columns) == 6:
            # Two sequences share this row: name, weight, length, twice.
            record.sequence_names.append(columns[3])
    if not terminated:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
247 248
def __read_command(record, handle):
    """Find the 'command:' line and store the command line on the record.

    Lines are consumed from handle up to and including the first one that
    starts with 'command:'.  That line, stripped and with the 'command: '
    label removed, is stored in record.command.

    Raises ValueError if no such line is found.
    """
    hit = next((row for row in handle if row.startswith('command:')), None)
    if hit is None:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'command'")
    record.command = hit.strip().replace('command: ', '')
258 259
def __create_motif(line):
    """Create a MEMEMotif from a motif header line.

    The first five characters of the line (the 'MOTIF' keyword, as checked
    by the caller) are dropped and the rest is split on whitespace.  Token
    3 becomes the motif length, token 6 the number of occurrences and
    token 12 the E-value.
    NOTE(review): presumably these are the values after 'width =',
    'sites =' and 'E-value =' -- confirm against real MEME output.
    """
    tokens = line[5:].split()
    motif = MEMEMotif()
    motif.length = int(tokens[3])
    motif._numoccurrences(tokens[6])
    motif._evalue(tokens[12])
    return motif
268 269
def __read_motif_name(motif, handle):
    """Find the 'sorted by position p-value' line and name the motif.

    Lines are consumed from handle up to and including the first one
    containing 'sorted by position p-value'.  The first two
    whitespace-separated tokens of that line, joined by a single space,
    become motif.name.

    Raises ValueError if no such line is found.
    """
    banner = next((row for row in handle if 'sorted by position p-value' in row), None)
    if banner is None:
        raise ValueError('Unexpected end of stream: Failed to find motif name')
    motif.name = " ".join(banner.split()[:2])
280 281
def __read_motif_sequences(motif, handle, rv):
    """Read the table of motif sites and add each one to the motif.

    The next three lines of handle must start with '---', 'Sequence name'
    and '---'.  Each following line, up to (and consuming) the next line
    starting with '---', is split on whitespace and passed to
    motif.add_instance_from_values().

    When rv is true the rows carry a strand column:
        name, strand, start, p-value, <token 4 unused>, sequence (token 5)
    otherwise:
        name, start, p-value, <token 3 unused>, sequence (token 4)

    Raises ValueError if the stream ends early or a header line does not
    match.
    """
    rule = next(handle, None)
    if rule is None:
        raise ValueError('Unexpected end of stream: Failed to find motif sequences')
    if not rule.startswith('---'):
        raise ValueError("Line does not start with '---':\n%s" % rule)

    heading = next(handle, None)
    if heading is None:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'Sequence name'")
    if not heading.startswith('Sequence name'):
        raise ValueError("Line does not start with 'Sequence name':\n%s" % heading)

    rule = next(handle, None)
    if rule is None:
        raise ValueError('Unexpected end of stream: Failed to find motif sequences')
    if not rule.startswith('---'):
        raise ValueError("Line does not start with '---':\n%s" % rule)

    for row in handle:
        if row.startswith('---'):
            break
        cols = row.split()
        if rv:
            values = dict(name=cols[0], strand=cols[1], start=cols[2], pvalue=cols[3], sequence=cols[5])
        else:
            values = dict(name=cols[0], start=cols[1], pvalue=cols[2], sequence=cols[4])
        motif.add_instance_from_values(**values)
    else:
        raise ValueError('Unexpected end of stream')
314 315
def __skip_to_prefix(handle, prefix):
    """Consume lines from handle through the first one starting with prefix.

    Raises ValueError if the stream ends before such a line is found.
    """
    for line in handle:
        if line.startswith(prefix):
            return line
    raise ValueError("Unexpected end of stream: Expected to find line starting with '%s'" % prefix)


def __skip_unused_lines(handle):
    """Skip the parts of a motif section that the parser does not use.

    Consumes, in order: everything through the 'log-odds matrix' line and
    the '---' line after it, everything through the 'letter-probability
    matrix' line and the '---' line after it, and everything through the
    'Time' line.  That must be followed by a blank line, a line starting
    with '***', any number of blank lines, and another line starting with
    '***'.  On return the handle is positioned just after that last line.

    Raises ValueError if the stream ends early or a line does not match.
    """
    for prefix in ('log-odds matrix', '---', 'letter-probability matrix', '---', 'Time'):
        __skip_to_prefix(handle, prefix)

    try:
        line = next(handle)
    except StopIteration:
        raise ValueError('Unexpected end of stream: Expected to find blank line')
    if line.strip():
        raise ValueError("Expected blank line, but got:\n%s" % line)

    try:
        line = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
    if not line.startswith('***'):
        raise ValueError("Line does not start with '***':\n%s" % line)

    # Blank lines may precede the closing '***' line; the first non-blank
    # line must be that closing line.
    for line in handle:
        if line.strip():
            break
    else:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
    if not line.startswith('***'):
        raise ValueError("Line does not start with '***':\n%s" % line)
361