Package Bio :: Package SearchIO :: Package ExonerateIO :: Module _base
[hide private]
[frames] | no frames]

Source Code for Module Bio.SearchIO.ExonerateIO._base

  1  # Copyright 2012 by Wibowo Arindrarto.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Bio.SearchIO abstract base parser for Exonerate standard output format.""" 
  7   
  8  import re 
  9   
 10  from Bio.SearchIO._index import SearchIndexer 
 11  from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment 
 12  from Bio.SeqUtils import seq1 
 13   
 14   
 15  # strand char-value mapping 
 16  _STRAND_MAP = {'+': 1, '-': -1, '.': 0} 
 17   
 18  _RE_SHIFTS = re.compile(r'(#+)') 
 19  # regex for checking whether a vulgar line has protein/translated components 
 20  _RE_TRANS = re.compile(r'[53ISCF]') 
 21   
 22   
23 -def _set_frame(frag):
24 """Sets the HSPFragment frames.""" 25 frag.hit_frame = (frag.hit_start % 3 + 1) * frag.hit_strand 26 frag.query_frame = (frag.query_start % 3 + 1) * frag.query_strand
27 28
29 -def _make_triplets(seq):
30 """Splits a string into a list containing triplets of the original 31 string.""" 32 return [seq[3*i:3*(i+1)] for i in range(len(seq) // 3)]
33 34
35 -def _adjust_aa_seq(fraglist):
36 """Transforms three-letter amino acid codes into one-letters in the 37 given HSPFragments.""" 38 hsp_hstart = fraglist[0].hit_start 39 hsp_qstart = fraglist[0].query_start 40 for frag in fraglist: 41 assert frag.query_strand == 0 or frag.hit_strand == 0 42 # fragment should have a length that is a multiple of 3 43 assert len(frag) % 3 == 0 44 # hit step may be -1 as we're aligning to DNA 45 hstep = 1 if frag.hit_strand >= 0 else -1 46 # get one letter codes 47 # and replace gap codon markers and termination characters 48 custom_map = {'***': '*', '<->': '-'} 49 50 hseq1 = seq1(str(frag.hit.seq), custom_map=custom_map) 51 hstart = hsp_hstart 52 hend = hstart + len(hseq1.replace('-', '')) * hstep 53 54 qseq1 = seq1(str(frag.query.seq), custom_map=custom_map) 55 qstart = hsp_qstart 56 qend = qstart + len(qseq1.replace('-', '')) 57 58 # replace the old frag sequences with the new ones 59 frag.hit = None 60 frag.query = None 61 frag.hit = hseq1 62 frag.query = qseq1 63 64 # set coordinates for the protein sequence 65 if frag.query_strand == 0: 66 frag.query_start, frag.query_end = qstart, qend 67 elif frag.hit_strand == 0: 68 frag.hit_start, frag.hit_end = hstart, hend 69 70 # update alignment annotation 71 # by turning them into list of triplets 72 for annot, annotseq in frag.aln_annotation.items(): 73 frag.aln_annotation[annot] = _make_triplets(annotseq) 74 75 # update values for next iteration 76 hsp_hstart, hsp_qstart = hend, qend 77 78 return fraglist
79 80
81 -def _split_fragment(frag):
82 """Splits one HSPFragment containing frame-shifted alignment into two.""" 83 # given an HSPFragment object with frameshift(s), this method splits it 84 # into fragments without frameshifts by sequentially chopping it off 85 # starting from the beginning 86 simil = frag.aln_annotation['similarity'] 87 # we should have at least 1 frame shift for splitting 88 assert simil.count('#') > 0 89 90 split_frags = [] 91 qstep = 1 if frag.query_strand >= 0 else -1 92 hstep = 1 if frag.hit_strand >= 0 else -1 93 qpos = min(frag.query_range) if qstep >= 0 else max(frag.query_range) 94 hpos = min(frag.hit_range) if qstep >= 0 else max(frag.hit_range) 95 abs_pos = 0 96 # split according to hit, then query 97 while simil: 98 99 try: 100 shifts = re.search(_RE_SHIFTS, simil).group(1) 101 s_start = simil.find(shifts) 102 s_stop = s_start + len(shifts) 103 split = frag[abs_pos:abs_pos+s_start] 104 except AttributeError: # no '#' in simil, i.e. last frag 105 shifts = '' 106 s_start = 0 107 s_stop = len(simil) 108 split = frag[abs_pos:] 109 110 # coordinates for the split strand 111 qstart, hstart = qpos, hpos 112 qpos += (len(split) - sum(str(split.query.seq).count(x) 113 for x in ('-', '<', '>'))) * qstep 114 hpos += (len(split) - sum(str(split.hit.seq).count(x) 115 for x in ('-', '<', '>'))) * hstep 116 117 split.hit_start = min(hstart, hpos) 118 split.query_start = min(qstart, qpos) 119 split.hit_end = max(hstart, hpos) 120 split.query_end = max(qstart, qpos) 121 122 # account for frameshift length 123 abs_slice = slice(abs_pos+s_start, abs_pos+s_stop) 124 if len(frag.aln_annotation) == 2: 125 seqs = (str(frag[abs_slice].query.seq), 126 str(frag[abs_slice].hit.seq)) 127 elif len(frag.aln_annotation) == 3: 128 seqs = (frag[abs_slice].aln_annotation['query_annotation'], 129 frag[abs_slice].aln_annotation['hit_annotation'],) 130 if '#' in seqs[0]: 131 qpos += len(shifts) * qstep 132 elif '#' in seqs[1]: 133 hpos += len(shifts) * hstep 134 135 # set frame 136 _set_frame(split) 137 split_frags.append(split) 138 # set similarity string and absolute position for the next loop 139 simil = simil[s_stop:] 140 abs_pos += s_stop 141 142 return split_frags
143 144
145 -def _create_hsp(hid, qid, hspd):
146 """Returns a list of HSP objects from the given parsed HSP values.""" 147 frags = [] 148 # we are iterating over query_ranges, but hit_ranges works just as well 149 for idx, qcoords in enumerate(hspd['query_ranges']): 150 # get sequences, create object 151 hseqlist = hspd.get('hit') 152 hseq = '' if hseqlist is None else hseqlist[idx] 153 qseqlist = hspd.get('query') 154 qseq = '' if qseqlist is None else qseqlist[idx] 155 frag = HSPFragment(hid, qid, hit=hseq, query=qseq) 156 # coordinates 157 frag.query_start = qcoords[0] 158 frag.query_end = qcoords[1] 159 frag.hit_start = hspd['hit_ranges'][idx][0] 160 frag.hit_end = hspd['hit_ranges'][idx][1] 161 # alignment annotation 162 try: 163 aln_annot = hspd.get('aln_annotation', {}) 164 for key, value in aln_annot.items(): 165 frag.aln_annotation[key] = value[idx] 166 except IndexError: 167 pass 168 # strands 169 frag.query_strand = hspd['query_strand'] 170 frag.hit_strand = hspd['hit_strand'] 171 # and append the hsp object to the list 172 if frag.aln_annotation.get('similarity') is not None: 173 if '#' in frag.aln_annotation['similarity']: 174 frags.extend(_split_fragment(frag)) 175 continue 176 # try to set frame if there are translation in the alignment 177 if len(frag.aln_annotation) > 1 or \ 178 frag.query_strand == 0 or \ 179 ('vulgar_comp' in hspd and re.search(_RE_TRANS, hspd['vulgar_comp'])): 180 _set_frame(frag) 181 182 frags.append(frag) 183 184 # if the query is protein, we need to change the hit and query sequences 185 # from three-letter amino acid codes to one letter, and adjust their 186 # coordinates accordingly 187 if len(frags[0].aln_annotation) == 2: # 2 annotations == protein query 188 frags = _adjust_aa_seq(frags) 189 190 hsp = HSP(frags) 191 # set hsp-specific attributes 192 for attr in ('score', 'hit_split_codons', 'query_split_codons', 193 'model', 'vulgar_comp', 'cigar_comp', 'alphabet'): 194 if attr in hspd: 195 setattr(hsp, attr, hspd[attr]) 196 197 return hsp
198 199
200 -def _parse_hit_or_query_line(line):
201 """Parse the 'Query:' line of exonerate alignment outputs.""" 202 try: 203 mark, id, desc = line.split(' ', 2) 204 except ValueError: # no desc 205 mark, id = line.split(' ', 1) 206 desc = '' 207 208 return id, desc
209 210
211 -class _BaseExonerateParser(object):
212 213 """Abstract iterator for exonerate format.""" 214 215 _ALN_MARK = None 216
217 - def __init__(self, handle):
218 self.handle = handle 219 self.has_c4_alignment = False
220
221 - def __iter__(self):
222 # read line until the first alignment block or cigar/vulgar lines 223 while True: 224 self.line = self.handle.readline() 225 # flag for human-readable alignment block 226 if self.line.startswith('C4 Alignment:') and not \ 227 self.has_c4_alignment: 228 self.has_c4_alignment = True 229 if self.line.startswith('C4 Alignment:') or \ 230 self.line.startswith('vulgar:') or \ 231 self.line.startswith('cigar:'): 232 break 233 elif not self.line or self.line.startswith('-- completed '): 234 raise StopIteration 235 236 for qresult in self._parse_qresult(): 237 qresult.program = 'exonerate' 238 # HACK: so that all descriptions are set 239 qresult.description = qresult.description 240 for hit in qresult: 241 hit.description = hit.description 242 yield qresult
243
244 - def read_until(self, bool_func):
245 """Reads the file handle until the given bool function returns True.""" 246 while True: 247 if not self.line or bool_func(self.line): 248 return 249 else: 250 self.line = self.handle.readline()
251
252 - def parse_alignment_block(self, header):
253 raise NotImplementedError("Subclass must implement this")
254
255 - def _parse_alignment_header(self):
256 # read all header lines and store them 257 aln_header = [] 258 # header is everything before the first empty line 259 while self.line.strip(): 260 aln_header.append(self.line.strip()) 261 self.line = self.handle.readline() 262 # then parse them 263 qresult, hit, hsp = {}, {}, {} 264 for line in aln_header: 265 # query line 266 if line.startswith('Query:'): 267 qresult['id'], qresult['description'] = \ 268 _parse_hit_or_query_line(line) 269 # target line 270 elif line.startswith('Target:'): 271 hit['id'], hit['description'] = \ 272 _parse_hit_or_query_line(line) 273 # model line 274 elif line.startswith('Model:'): 275 qresult['model'] = line.split(' ', 1)[1] 276 # score line 277 elif line.startswith('Raw score:'): 278 hsp['score'] = line.split(' ', 2)[2] 279 # query range line 280 elif line.startswith('Query range:'): 281 # line is always 'Query range: \d+ -> \d+', so we can pluck 282 # the numbers directly 283 hsp['query_start'], hsp['query_end'] = line.split(' ', 4)[2:5:2] 284 # hit range line 285 elif line.startswith('Target range:'): 286 # same logic with query range 287 hsp['hit_start'], hsp['hit_end'] = line.split(' ', 4)[2:5:2] 288 289 # determine strand 290 if qresult['description'].endswith(':[revcomp]'): 291 hsp['query_strand'] = '-' 292 qresult['description'] = qresult['description'].replace(':[revcomp]', '') 293 elif 'protein2' in qresult['model']: 294 hsp['query_strand'] = '.' 295 else: 296 hsp['query_strand'] = '+' 297 if hit['description'].endswith(':[revcomp]'): 298 hsp['hit_strand'] = '-' 299 hit['description'] = hit['description'].replace(':[revcomp]', '') 300 elif '2protein' in qresult['model']: 301 hsp['hit_strand'] = '.' 302 else: 303 hsp['hit_strand'] = '+' 304 305 # NOTE: we haven't processed the coordinates types 306 # and the strands are not yet Biopython's standard (1 / -1 / 0) 307 # since it's easier if we do the conversion later 308 309 return {'qresult': qresult, 'hit': hit, 'hsp': hsp}
310
311 - def _parse_qresult(self):
312 # state values 313 state_EOF = 0 314 state_QRES_NEW = 1 315 state_QRES_SAME = 3 316 state_HIT_NEW = 2 317 state_HIT_SAME = 4 318 # initial dummies 319 qres_state, hit_state = None, None 320 file_state = None 321 prev_qid, prev_hid = None, None 322 cur, prev = None, None 323 hit_list, hsp_list = [], [] 324 # if the file has c4 alignments, use that as the alignment mark 325 if self.has_c4_alignment: 326 self._ALN_MARK = 'C4 Alignment:' 327 328 while True: 329 self.read_until(lambda line: line.startswith(self._ALN_MARK)) 330 if cur is not None: 331 prev = cur 332 prev_qid = cur_qid 333 prev_hid = cur_hid 334 # only parse the result row if it's not EOF 335 if self.line: 336 assert self.line.startswith(self._ALN_MARK), self.line 337 # create temp dicts for storing parsed values 338 header = {'qresult': {}, 'hit': {}, 'hsp': {}} 339 # if the file has c4 alignments, try to parse the header 340 if self.has_c4_alignment: 341 self.read_until(lambda line: 342 line.strip().startswith('Query:')) 343 header = self._parse_alignment_header() 344 # parse the block contents 345 cur = self.parse_alignment_block(header) 346 cur_qid = cur['qresult']['id'] 347 cur_hid = cur['hit']['id'] 348 elif not self.line or self.line.startswith('-- completed '): 349 file_state = state_EOF 350 cur_qid, cur_hid = None, None 351 352 # get the state of hit and qresult 353 if prev_qid != cur_qid: 354 qres_state = state_QRES_NEW 355 else: 356 qres_state = state_QRES_SAME 357 # new hits are hits with different ids or hits in a new query 358 if prev_hid != cur_hid or qres_state == state_QRES_NEW: 359 hit_state = state_HIT_NEW 360 else: 361 hit_state = state_HIT_SAME 362 363 if prev is not None: 364 hsp = _create_hsp(prev_hid, prev_qid, prev['hsp']) 365 hsp_list.append(hsp) 366 367 if hit_state == state_HIT_NEW: 368 hit = Hit(hsp_list) 369 for attr, value in prev['hit'].items(): 370 setattr(hit, attr, value) 371 hit_list.append(hit) 372 hsp_list = [] 373 374 if qres_state == state_QRES_NEW or file_state == state_EOF: 375 qresult = QueryResult(id=prev_qid) 376 for hit in hit_list: 377 # not using append since Exonerate may separate the 378 # same hit if it has different strands 379 qresult.absorb(hit) 380 for attr, value in prev['qresult'].items(): 381 setattr(qresult, attr, value) 382 yield qresult 383 if file_state == state_EOF: 384 break 385 hit_list = [] 386 387 # only readline() here if we're not parsing C4 alignments 388 # C4 alignments readline() is handled by its parse_alignment_block 389 # function 390 if not self.has_c4_alignment: 391 self.line = self.handle.readline()
392 393
394 -class _BaseExonerateIndexer(SearchIndexer):
395 396 """Indexer class for Exonerate plain text.""" 397 398 _parser = None # should be defined by subclass 399 _query_mark = None # this one too 400
401 - def get_qresult_id(self, pos):
402 raise NotImplementedError("Should be defined by subclass")
403
404 - def __iter__(self):
405 """Iterates over the file handle; yields key, start offset, and length.""" 406 handle = self._handle 407 handle.seek(0) 408 qresult_key = None 409 410 while True: 411 start_offset = handle.tell() 412 line = handle.readline() 413 if line.startswith(self._query_mark): 414 if qresult_key is None: 415 qresult_key = self.get_qresult_id(start_offset) 416 qresult_offset = start_offset 417 else: 418 curr_key = self.get_qresult_id(start_offset) 419 if curr_key != qresult_key: 420 yield qresult_key, qresult_offset, \ 421 start_offset - qresult_offset 422 qresult_key = curr_key 423 qresult_offset = start_offset 424 handle.seek(qresult_offset) 425 elif not line: 426 yield qresult_key, qresult_offset, \ 427 start_offset - qresult_offset 428 break
429 430 431 # if not used as a module, run the doctest 432 if __name__ == "__main__": 433 from Bio._utils import run_doctest 434 run_doctest() 435