Package Bio :: Package SearchIO :: Package HmmerIO :: Module hmmer2_text
[hide private]
[frames] | no frames]

Source Code for Module Bio.SearchIO.HmmerIO.hmmer2_text

  1  # Copyright 2012 by Kai Blin. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Bio.SearchIO parser for HMMER 2 text output.""" 
  7   
  8  import re 
  9   
 10  from Bio._py3k import _as_bytes, _bytes_to_string 
 11  from Bio._utils import read_forward 
 12  from Bio.Alphabet import generic_protein 
 13  from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment 
 14   
 15  from ._base import _BaseHmmerTextIndexer 
 16   
 17  __all__ = ('Hmmer2TextParser', 'Hmmer2TextIndexer') 
 18   
 19   
 20  _HSP_ALIGN_LINE = re.compile(r'(\S+):\s+domain (\d+) of (\d+)') 
 21   
 22   
23 -class _HitPlaceholder(object):
24 - def createHit(self, hsp_list):
25 hit = Hit(hsp_list) 26 hit.id_ = self.id_ 27 hit.evalue = self.evalue 28 hit.bitscore = self.bitscore 29 if self.description: 30 hit.description = self.description 31 hit.domain_obs_num = self.domain_obs_num 32 return hit
33 34
35 -class Hmmer2TextParser(object):
36 """Iterator for the HMMER 2.0 text output.""" 37
38 - def __init__(self, handle):
39 self.handle = handle 40 self.buf = [] 41 self._meta = self.parse_preamble()
42
43 - def __iter__(self):
44 for qresult in self.parse_qresult(): 45 qresult.program = self._meta.get('program') 46 qresult.target = self._meta.get('target') 47 qresult.version = self._meta.get('version') 48 yield qresult
49
50 - def read_next(self, rstrip=True):
51 """Return the next non-empty line, trailing whitespace removed""" 52 if len(self.buf) > 0: 53 return self.buf.pop() 54 self.line = self.handle.readline() 55 while self.line and rstrip and not self.line.strip(): 56 self.line = self.handle.readline() 57 if self.line: 58 if rstrip: 59 self.line = self.line.rstrip() 60 return self.line
61
62 - def push_back(self, line):
63 """Un-read a line that should not be parsed yet""" 64 self.buf.append(line)
65
66 - def parse_key_value(self):
67 """Parse key-value pair separated by colon (:)""" 68 key, value = self.line.split(':', 1) 69 return key.strip(), value.strip()
70
71 - def parse_preamble(self):
72 """Parse HMMER2 preamble.""" 73 meta = {} 74 state = "GENERIC" 75 while self.read_next(): 76 if state == "GENERIC": 77 if self.line.startswith('hmm'): 78 meta['program'] = self.line.split('-')[0].strip() 79 elif self.line.startswith('HMMER is'): 80 continue 81 elif self.line.startswith('HMMER'): 82 meta['version'] = self.line.split()[1] 83 elif self.line.count('-') == 36: 84 state = "OPTIONS" 85 continue 86 87 assert state == "OPTIONS" 88 assert 'program' in meta 89 90 if self.line.count('-') == 32: 91 break 92 93 key, value = self.parse_key_value() 94 if meta['program'] == 'hmmsearch': 95 if key == 'Sequence database': 96 meta['target'] = value 97 continue 98 elif meta['program'] == 'hmmpfam': 99 if key == 'HMM file': 100 meta['target'] = value 101 continue 102 meta[key] = value 103 104 return meta
105
106 - def parse_qresult(self):
107 """Parse a HMMER2 query block.""" 108 while self.read_next(): 109 if not self.line.startswith('Query'): 110 raise StopIteration() 111 _, id_ = self.parse_key_value() 112 self.qresult = QueryResult(id=id_) 113 114 description = None 115 116 while self.read_next() and not self.line.startswith('Scores'): 117 if self.line.startswith('Accession'): 118 self.qresult.accession = self.parse_key_value()[1] 119 if self.line.startswith('Description'): 120 description = self.parse_key_value()[1] 121 122 hit_placeholders = self.parse_hits() 123 if len(hit_placeholders) > 0: 124 self.parse_hsps(hit_placeholders) 125 self.parse_hsp_alignments() 126 127 while not self.line.startswith('Query'): 128 self.read_next() 129 if not self.line: 130 break 131 self.buf.append(self.line) 132 133 if description is not None: 134 self.qresult.description = description 135 yield self.qresult
136
137 - def parse_hits(self):
138 """Parse a HMMER2 hit block, beginning with the hit table.""" 139 hit_placeholders = [] 140 while self.read_next(): 141 if self.line.startswith('Parsed'): 142 break 143 if self.line.find('no hits') > -1: 144 break 145 146 if self.line.startswith('Sequence') or \ 147 self.line.startswith('Model') or \ 148 self.line.startswith('-------- '): 149 continue 150 151 fields = self.line.split() 152 id_ = fields.pop(0) 153 domain_obs_num = int(fields.pop()) 154 evalue = float(fields.pop()) 155 bitscore = float(fields.pop()) 156 description = ' '.join(fields).strip() 157 158 hit = _HitPlaceholder() 159 hit.id_ = id_ 160 hit.evalue = evalue 161 hit.bitscore = bitscore 162 hit.description = description 163 hit.domain_obs_num = domain_obs_num 164 hit_placeholders.append(hit) 165 166 return hit_placeholders
167
168 - def parse_hsps(self, hit_placeholders):
169 """Parse a HMMER2 hsp block, beginning with the hsp table.""" 170 # HSPs may occur in different order than the hits 171 # so store Hit objects separately first 172 unordered_hits = {} 173 while self.read_next(): 174 if self.line.startswith('Alignments') or \ 175 self.line.startswith('Histogram') or \ 176 self.line == '//': 177 break 178 if self.line.startswith('Model') or \ 179 self.line.startswith('Sequence') or \ 180 self.line.startswith('--------'): 181 continue 182 183 id_, domain, seq_f, seq_t, seq_compl, hmm_f, hmm_t, hmm_compl, \ 184 score, evalue = self.line.split() 185 186 frag = HSPFragment(id_, self.qresult.id) 187 frag.alphabet = generic_protein 188 if self._meta['program'] == 'hmmpfam': 189 frag.hit_start = int(hmm_f) - 1 190 frag.hit_end = int(hmm_t) 191 frag.query_start = int(seq_f) - 1 192 frag.query_end = int(seq_t) 193 elif self._meta['program'] == 'hmmsearch': 194 frag.query_start = int(hmm_f) - 1 195 frag.query_end = int(hmm_t) 196 frag.hit_start = int(seq_f) - 1 197 frag.hit_end = int(seq_t) 198 199 hsp = HSP([frag]) 200 hsp.evalue = float(evalue) 201 hsp.bitscore = float(score) 202 hsp.domain_index = int(domain.split('/')[0]) 203 if self._meta['program'] == 'hmmpfam': 204 hsp.hit_endtype = hmm_compl 205 hsp.query_endtype = seq_compl 206 elif self._meta['program'] == 'hmmsearch': 207 hsp.query_endtype = hmm_compl 208 hsp.hit_endtype = seq_compl 209 210 if id_ not in unordered_hits: 211 placeholder = [p for p in hit_placeholders if p.id_ == id_][0] 212 hit = placeholder.createHit([hsp]) 213 unordered_hits[id_] = hit 214 else: 215 hit = unordered_hits[id_] 216 hsp.hit_description = hit.description 217 hit.append(hsp) 218 219 # The placeholder list is in the correct order, so use that order for 220 # the Hit objects in the qresult 221 for p in hit_placeholders: 222 self.qresult.append(unordered_hits[p.id_])
223
224 - def parse_hsp_alignments(self):
225 """Parse a HMMER2 HSP alignment block.""" 226 if not self.line.startswith('Alignments'): 227 return 228 229 while self.read_next(): 230 if self.line == '//' or self.line.startswith('Histogram'): 231 break 232 233 match = re.search(_HSP_ALIGN_LINE, self.line) 234 if match is None: 235 continue 236 237 id_ = match.group(1) 238 idx = int(match.group(2)) 239 num = int(match.group(3)) 240 241 hit = self.qresult[id_] 242 if hit.domain_obs_num != num: 243 continue 244 245 frag = hit[idx - 1][0] 246 247 hmmseq = '' 248 consensus = '' 249 otherseq = '' 250 structureseq = '' 251 pad = 0 252 while self.read_next() and self.line.startswith(' '): 253 # if there's structure information, parse that 254 if self.line[16:18] == 'CS': 255 structureseq += self.line[19:].strip() 256 257 if not self.read_next(): 258 break 259 260 # skip the *-> start marker if it exists 261 if self.line[19] == '*': 262 seq = self.line[22:] 263 pad = 3 264 else: 265 seq = self.line[19:] 266 pad = 0 267 268 # get rid of the end marker 269 if seq.endswith('<-*'): 270 seq = seq[:-3] 271 272 hmmseq += seq 273 line_len = len(seq) 274 if not self.read_next(rstrip=False): 275 break 276 consensus += self.line[19 + pad:19 + pad + line_len] 277 # If there's no consensus sequence, hmmer2 doesn't 278 # bother to put spaces here, so add extra padding 279 extra_padding = len(hmmseq) - len(consensus) 280 consensus += ' ' * extra_padding 281 282 if not self.read_next(): 283 break 284 otherseq += self.line[19:].split()[0].strip() 285 286 self.push_back(self.line) 287 288 # add similarity sequence to annotation 289 frag.aln_annotation['similarity'] = consensus 290 291 # if there's structure information, add it to the fragment 292 if structureseq: 293 frag.aln_annotation['CS'] = structureseq 294 295 if self._meta['program'] == 'hmmpfam': 296 frag.hit = hmmseq 297 frag.query = otherseq 298 else: 299 frag.hit = otherseq 300 frag.query = hmmseq
301 302
303 -class Hmmer2TextIndexer(_BaseHmmerTextIndexer):
304 """Indexer for hmmer2-text format.""" 305 306 _parser = Hmmer2TextParser 307 qresult_start = _as_bytes('Query') 308 # qresults_ends for hmmpfam and hmmsearch 309 # need to anticipate both since hmmsearch have different query end mark 310 qresult_end = _as_bytes('//') 311
312 - def __iter__(self):
313 handle = self._handle 314 handle.seek(0) 315 start_offset = handle.tell() 316 regex_id = re.compile(_as_bytes(r'Query\s*(?:sequence|HMM)?:\s*(.*)')) 317 318 # determine flag for hmmsearch 319 is_hmmsearch = False 320 line = read_forward(handle) 321 if line.startswith(_as_bytes('hmmsearch')): 322 is_hmmsearch = True 323 324 while True: 325 end_offset = handle.tell() 326 327 if line.startswith(self.qresult_start): 328 regx = re.search(regex_id, line) 329 qresult_key = regx.group(1).strip() 330 # qresult start offset is the offset of this line 331 # (starts with the start mark) 332 start_offset = end_offset - len(line) 333 elif line.startswith(self.qresult_end): 334 yield _bytes_to_string(qresult_key), start_offset, 0 335 start_offset = end_offset 336 elif not line: 337 # HACK: since hmmsearch can only have one query result 338 if is_hmmsearch: 339 yield _bytes_to_string(qresult_key), start_offset, 0 340 break 341 342 line = read_forward(handle)
343 344 345 # if not used as a module, run the doctest 346 if __name__ == "__main__": 347 from Bio._utils import run_doctest 348 run_doctest() 349