Package Bio :: Package SearchIO :: Package BlastIO :: Module blast_tab
[hide private]
[frames] | no frames]

Source Code for Module Bio.SearchIO.BlastIO.blast_tab

  1  # Copyright 2012 by Wibowo Arindrarto.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Bio.SearchIO parser for BLAST+ tab output format, with or without comments.""" 
  7   
  8  import re 
  9   
 10  from Bio._py3k import _as_bytes, _bytes_to_string 
 11  from Bio.SearchIO._index import SearchIndexer 
 12  from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment 
 13   
 14   
 15  __all__ = ['BlastTabIndexer', 'BlastTabParser', 'BlastTabWriter'] 
 16   
 17   
 18  # longname-shortname map 
 19  # maps the column names shown in a commented output to its short name 
 20  # (the one used in the command line) 
 21  _LONG_SHORT_MAP = { 
 22      'query id': 'qseqid', 
 23      'query acc.': 'qacc', 
 24      'query acc.ver': 'qaccver', 
 25      'query length': 'qlen', 
 26      'subject id': 'sseqid', 
 27      'subject acc.': 'sacc', 
 28      'subject acc.ver': 'saccver', 
 29      'subject length': 'slen', 
 30      'alignment length': 'length', 
 31      'bit score': 'bitscore', 
 32      'score': 'score', 
 33      'evalue': 'evalue', 
 34      'identical': 'nident', 
 35      '% identity': 'pident', 
 36      'positives': 'positive', 
 37      '% positives': 'ppos', 
 38      'mismatches': 'mismatch', 
 39      'gaps': 'gaps', 
 40      'q. start': 'qstart', 
 41      'q. end': 'qend', 
 42      's. start': 'sstart', 
 43      's. end': 'send', 
 44      'query frame': 'qframe', 
 45      'sbjct frame': 'sframe', 
 46      'query/sbjct frames': 'frames', 
 47      'query seq': 'qseq', 
 48      'subject seq': 'sseq', 
 49      'gap opens': 'gapopen', 
 50      'query gi': 'qgi', 
 51      'subject ids': 'sallseqid', 
 52      'subject gi': 'sgi', 
 53      'subject gis': 'sallgi', 
 54      'BTOP': 'btop', 
 55  } 
 56   
 57  # column to class attribute map 
 58  _COLUMN_QRESULT = { 
 59      'qseqid': ('id', str), 
 60      'qacc': ('accession', str), 
 61      'qaccver': ('accession_version', str), 
 62      'qlen': ('seq_len', int), 
 63      'qgi': ('gi', str), 
 64  } 
 65  _COLUMN_HIT = { 
 66      'sseqid': ('id', str), 
 67      'sallseqid': ('id_all', str), 
 68      'sacc': ('accession', str), 
 69      'saccver': ('accession_version', str), 
 70      'sgi': ('gi', str), 
 71      'sallgi': ('gi_all', str), 
 72      'slen': ('seq_len', int), 
 73  } 
 74  _COLUMN_HSP = { 
 75      'bitscore': ('bitscore', float), 
 76      'score': ('bitscore_raw', int), 
 77      'evalue': ('evalue', float), 
 78      'nident': ('ident_num', int), 
 79      'pident': ('ident_pct', float), 
 80      'positive': ('pos_num', int), 
 81      'ppos': ('pos_pct', float), 
 82      'mismatch': ('mismatch_num', int), 
 83      'gaps': ('gap_num', int), 
 84      'gapopen': ('gapopen_num', int), 
 85      'btop': ('btop', str), 
 86  } 
 87  _COLUMN_FRAG = { 
 88      'length': ('aln_span', int), 
 89      'qstart': ('query_start', int), 
 90      'qend': ('query_end', int), 
 91      'sstart': ('hit_start', int), 
 92      'send': ('hit_end', int), 
 93      'qframe': ('query_frame', int), 
 94      'sframe': ('hit_frame', int), 
 95      'frames': ('frames', str), 
 96      'qseq': ('query', str), 
 97      'sseq': ('hit', str), 
 98  } 
 99  _SUPPORTED_FIELDS = set(_COLUMN_QRESULT.keys() + _COLUMN_HIT.keys() + 
100          _COLUMN_HSP.keys() + _COLUMN_FRAG.keys()) 
101   
102  # column order in the non-commented tabular output variant 
103  # values must be keys inside the column-attribute maps above 
104  _DEFAULT_FIELDS = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 
105          'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'] 
106  # one field from each of the following sets must exist in order for the 
107  # parser to work 
108  _MIN_QUERY_FIELDS = set(['qseqid', 'qacc', 'qaccver']) 
109  _MIN_HIT_FIELDS = set(['sseqid', 'sacc', 'saccver']) 
110   
111  # simple function to create BLAST HSP attributes that may be computed if 
112  # other certain attributes are present 
113  # This was previously implemented in the HSP objects in the old model 
114   
115  _RE_GAPOPEN = re.compile(r'\w-') 
116   
117   
118 -def _compute_gapopen_num(hsp):
119 """Returns the number of gap openings in the given HSP.""" 120 gapopen = 0 121 for seq_type in ('query', 'hit'): 122 seq = str(getattr(hsp, seq_type).seq) 123 gapopen += len(re.findall(_RE_GAPOPEN, seq)) 124 return gapopen
125 126
127 -def _augment_blast_hsp(hsp, attr):
128 """Calculates the given HSP attribute, for writing.""" 129 if attr == 'aln_span': 130 # aln_span is number of identical matches + mismatches + gaps 131 func = lambda hsp: hsp.ident_num + hsp.mismatch_num + hsp.gap_num 132 # ident and gap will require the num values to be computed first 133 elif attr.startswith('ident'): 134 func = lambda hsp: hsp.aln_span - hsp.mismatch_num - hsp.gap_num 135 elif attr.startswith('gap'): 136 func = lambda hsp: hsp.aln_span - hsp.ident_num - hsp.mismatch_num 137 elif attr == 'mismatch_num': 138 func = lambda hsp: hsp.aln_span - hsp.ident_num - hsp.gap_num 139 elif attr == 'gapopen_num': 140 if not hasattr(hsp, 'query') or not hasattr(hsp, 'hit'): 141 # mock function so that the except clause below is triggered 142 # as both the query and hit are required to compute gapopen 143 def mock(hsp): 144 raise AttributeError
145 func = mock 146 else: 147 func = _compute_gapopen_num 148 149 # set the num values 150 # requires the endswith check, since we only want to set 'num' or 'span' 151 # attributes here 152 if not hasattr(hsp, attr) and not attr.endswith('_pct'): 153 value = func(hsp) 154 setattr(hsp, attr, value) 155 156 # if the attr is a percent value, calculate it 157 if attr == 'ident_pct': 158 func2 = lambda hsp: hsp.ident_num / float(hsp.aln_span) * 100 159 elif attr == 'pos_pct': 160 func = lambda hsp: hsp.pos_num / float(hsp.aln_span) * 100 161 elif attr == 'gap_pct': 162 func2 = lambda hsp: hsp.gap_num / float(hsp.aln_span) * 100 163 else: 164 func2 = None 165 166 # set the pct values 167 if func2 is not None: 168 value = func2(hsp) 169 setattr(hsp, attr, value) 170 171
172 -class BlastTabParser(object):
173 174 """Parser for the BLAST tabular format.""" 175
176 - def __init__(self, handle, comments=False, fields=_DEFAULT_FIELDS):
177 self.handle = handle 178 self.has_comments = comments 179 self.fields = self._prep_fields(fields) 180 self.line = self.handle.readline().strip()
181
182 - def __iter__(self):
183 # stop iteration if file has no lines 184 if not self.line: 185 raise StopIteration 186 # determine which iterator to use 187 elif self.has_comments: 188 iterfunc = self._parse_commented_qresult 189 else: 190 iterfunc = self._parse_qresult 191 192 for qresult in iterfunc(): 193 yield qresult
194
195 - def _prep_fields(self, fields):
196 """Validates and formats the given fields for use by the parser.""" 197 # cast into list if fields is a space-separated string 198 if isinstance(fields, basestring): 199 fields = fields.strip().split(' ') 200 # blast allows 'std' as a proxy for the standard default lists 201 # we want to transform 'std' to its proper column names 202 if 'std' in fields: 203 idx = fields.index('std') 204 fields = fields[:idx] + _DEFAULT_FIELDS + fields[idx+1:] 205 # if set(fields) has a null intersection with minimum required 206 # fields for hit and query, raise an exception 207 if not set(fields).intersection(_MIN_QUERY_FIELDS) or \ 208 not set(fields).intersection(_MIN_HIT_FIELDS): 209 raise ValueError("Required query and/or hit ID field not found.") 210 211 return fields
212
213 - def _parse_commented_qresult(self):
214 """Iterator returning `QueryResult` objects from a commented file.""" 215 while True: 216 comments = self._parse_comments() 217 if comments: 218 try: 219 self.fields = comments['fields'] 220 # iterator for the query results 221 qres_iter = self._parse_qresult() 222 except KeyError: 223 # no fields means the query has no results 224 assert 'fields' not in comments 225 # create an iterator returning one empty qresult 226 # if the query has no results 227 qres_iter = iter([QueryResult('')]) 228 229 for qresult in qres_iter: 230 for key, value in comments.items(): 231 setattr(qresult, key, value) 232 yield qresult 233 234 else: 235 break
236
237 - def _parse_comments(self):
238 """Returns a dictionary containing tab file comments.""" 239 comments = {} 240 while True: 241 # parse program and version 242 # example: # BLASTX 2.2.26+ 243 if 'BLAST' in self.line and 'processed' not in self.line: 244 program_line = self.line[len(' #'):].split(' ') 245 comments['program'] = program_line[0].lower() 246 comments['version'] = program_line[1] 247 # parse query id and description (if available) 248 # example: # Query: gi|356995852 Mus musculus POU domain 249 elif 'Query' in self.line: 250 query_line = self.line[len('# Query: '):].split(' ', 1) 251 comments['id'] = query_line[0] 252 if len(query_line) == 2: 253 comments['description'] = query_line[1] 254 # parse target database 255 # example: # Database: db/minirefseq_protein 256 elif 'Database' in self.line: 257 comments['target'] = self.line[len('# Database: '):] 258 # parse RID (from remote searches) 259 elif 'RID' in self.line: 260 comments['rid'] = self.line[len('# RID: '):] 261 # parse column order, required for parsing the result lines 262 # example: # Fields: query id, query gi, query acc., query length 263 elif 'Fields' in self.line: 264 comments['fields'] = self._parse_fields_line() 265 # if the line has these strings, it's either the end of a comment 266 # or the end of a file, so we return all the comments we've parsed 267 elif ' hits found' in self.line or 'processed' in self.line: 268 self.line = self.handle.readline().strip() 269 return comments 270 271 self.line = self.handle.readline() 272 273 if not self.line: 274 return comments 275 else: 276 self.line = self.line.strip()
277
278 - def _parse_fields_line(self):
279 """Returns a list of column short names from the 'Fields' 280 comment line.""" 281 raw_field_str = self.line[len('# Fields: '):] 282 long_fields = raw_field_str.split(', ') 283 fields = [_LONG_SHORT_MAP[long_name] for long_name in long_fields] 284 return self._prep_fields(fields)
285
286 - def _parse_result_row(self):
287 """Returns a dictionary of parsed row values.""" 288 fields = self.fields 289 columns = self.line.strip().split('\t') 290 assert len(fields) == len(columns), "Expected %i columns, found: " \ 291 "%i" % (len(fields), len(columns)) 292 293 qresult, hit, hsp, frag = {}, {}, {}, {} 294 for idx, value in enumerate(columns): 295 sname = fields[idx] 296 # flag to check if any of the _COLUMNs contain sname 297 in_mapping = False 298 # iterate over each dict, mapping pair to determine 299 # attribute name and value of each column 300 for parsed_dict, mapping in ( 301 (qresult, _COLUMN_QRESULT), 302 (hit, _COLUMN_HIT), 303 (hsp, _COLUMN_HSP), 304 (frag, _COLUMN_FRAG)): 305 # process parsed value according to mapping 306 if sname in mapping: 307 attr_name, caster = mapping[sname] 308 if caster is not str: 309 value = caster(value) 310 parsed_dict[attr_name] = value 311 in_mapping = True 312 # make sure that any unhandled field is not supported 313 if not in_mapping: 314 assert sname not in _SUPPORTED_FIELDS 315 316 return {'qresult': qresult, 'hit': hit, 'hsp': hsp, 'frag': frag}
317
318 - def _get_id(self, parsed):
319 """Returns the value used for a QueryResult or Hit ID from a parsed row.""" 320 # use 'id', with 'accession' and 'accession_version' fallbacks 321 # one of these must have a value since we've checked whether 322 # they exist or not when parsing the comments 323 id_cache = parsed.get('id') 324 if id_cache is None: 325 id_cache = parsed.get('accession') 326 if id_cache is None: 327 id_cache = parsed.get('accession_version') 328 329 return id_cache
330
331 - def _parse_qresult(self):
332 """Generator function that returns QueryResult objects.""" 333 # state values, used to determine what to do with each line 334 state_EOF = 0 335 state_QRES_NEW = 1 336 state_QRES_SAME = 3 337 state_HIT_NEW = 2 338 state_HIT_SAME = 4 339 # dummies for initial states 340 qres_state = None 341 hit_state = None 342 file_state = None 343 # dummies for initial id caches 344 prev_qid = None 345 prev_hid = None 346 # dummies for initial parsed value containers 347 cur, prev = None, None 348 hit_list, hsp_list = [], [] 349 350 while True: 351 # store previous line's parsed values if we've past the first line 352 if cur is not None: 353 prev = cur 354 prev_qid = cur_qid 355 prev_hid = cur_hid 356 # only parse the line if it's not EOF or not a comment line 357 if self.line and not self.line.startswith('#'): 358 cur = self._parse_result_row() 359 cur_qid = self._get_id(cur['qresult']) 360 cur_hid = self._get_id(cur['hit']) 361 else: 362 file_state = state_EOF 363 # mock values for cur_qid and cur_hid since the line is empty 364 cur_qid, cur_hid = None, None 365 366 # get the state of hit and qresult 367 if prev_qid != cur_qid: 368 qres_state = state_QRES_NEW 369 else: 370 qres_state = state_QRES_SAME 371 # new hits are hits with different id or hits in a new qresult 372 if prev_hid != cur_hid or qres_state == state_QRES_NEW: 373 hit_state = state_HIT_NEW 374 else: 375 hit_state = state_HIT_SAME 376 377 # we're creating objects for the previously parsed line(s), 378 # so nothing is done in the first parsed line (prev == None) 379 if prev is not None: 380 # every line is essentially an HSP with one fragment, so we 381 # create both of these for every line 382 frag = HSPFragment(prev_hid, prev_qid) 383 for attr, value in prev['frag'].items(): 384 # adjust coordinates to Python range 385 # NOTE: this requires both start and end coords to be 386 # present, otherwise a KeyError will be raised. 387 # Without this limitation, we might misleadingly set the 388 # start / end coords 389 for seq_type in ('query', 'hit'): 390 if attr == seq_type + '_start': 391 value = min(value, 392 prev['frag'][seq_type + '_end']) - 1 393 elif attr == seq_type + '_end': 394 value = max(value, 395 prev['frag'][seq_type + '_start']) 396 setattr(frag, attr, value) 397 # strand and frame setattr require the full parsed values 398 # to be set first 399 for seq_type in ('hit', 'query'): 400 # try to set hit and query frame 401 frame = self._get_frag_frame(frag, seq_type, 402 prev['frag']) 403 setattr(frag, '%s_frame' % seq_type, frame) 404 # try to set hit and query strand 405 strand = self._get_frag_strand(frag, seq_type, 406 prev['frag']) 407 setattr(frag, '%s_strand' % seq_type, strand) 408 409 hsp = HSP([frag]) 410 for attr, value in prev['hsp'].items(): 411 setattr(hsp, attr, value) 412 hsp_list.append(hsp) 413 414 # create hit and append to temp hit container if hit_state 415 # says we're not at the same hit or at a new query 416 if hit_state == state_HIT_NEW: 417 hit = Hit(hsp_list) 418 for attr, value in prev['hit'].items(): 419 setattr(hit, attr, value) 420 hit_list.append(hit) 421 hsp_list = [] 422 # create qresult and yield if we're at a new qresult or EOF 423 if qres_state == state_QRES_NEW or file_state == state_EOF: 424 qresult = QueryResult(prev_qid, hits=hit_list) 425 for attr, value in prev['qresult'].items(): 426 setattr(qresult, attr, value) 427 yield qresult 428 # if current line is EOF, break 429 if file_state == state_EOF: 430 break 431 hit_list = [] 432 433 self.line = self.handle.readline().strip()
434
435 - def _get_frag_frame(self, frag, seq_type, parsedict):
436 """Returns `HSPFragment` frame given the object, its sequence type, 437 and its parsed dictionary values.""" 438 assert seq_type in ('query', 'hit') 439 frame = getattr(frag, '%s_frame' % seq_type, None) 440 if frame is not None: 441 return frame 442 else: 443 if 'frames' in parsedict: 444 # frames is 'x1/x2' string, x1 is query frame, x2 is hit frame 445 idx = 0 if seq_type == 'query' else 1 446 return int(parsedict['frames'].split('/')[idx])
447 # else implicit None return 448
449 - def _get_frag_strand(self, frag, seq_type, parsedict):
450 """Returns `HSPFragment` strand given the object, its sequence type, 451 and its parsed dictionary values.""" 452 # NOTE: this will never set the strands as 0 for protein 453 # queries / hits, since we can't detect the blast flavors 454 # from the columns alone. 455 assert seq_type in ('query', 'hit') 456 strand = getattr(frag, '%s_strand' % seq_type, None) 457 if strand is not None: 458 return strand 459 else: 460 # using parsedict instead of the fragment object since 461 # we need the unadjusted coordinated values 462 start = parsedict.get('%s_start' % seq_type) 463 end = parsedict.get('%s_end' % seq_type) 464 if start is not None and end is not None: 465 return 1 if start <= end else -1
466 # else implicit None return 467 468
469 -class BlastTabIndexer(SearchIndexer):
470 471 """Indexer class for BLAST+ tab output.""" 472 473 _parser = BlastTabParser 474
475 - def __init__(self, filename, comments=False, fields=_DEFAULT_FIELDS):
476 SearchIndexer.__init__(self, filename, comments=comments, fields=fields) 477 478 # if the file doesn't have comments, 479 # get index of column used as the key (qseqid / qacc / qaccver) 480 if not self._kwargs['comments']: 481 if 'qseqid' in fields: 482 self._key_idx = fields.index('qseqid') 483 elif 'qacc' in fields: 484 self._key_idx = fields.index('qacc') 485 elif 'qaccver' in fields: 486 self._key_idx = fields.index('qaccver') 487 else: 488 raise ValueError("Custom fields is missing an ID column. " 489 "One of these must be present: 'qseqid', 'qacc', or 'qaccver'.")
490
491 - def __iter__(self):
492 """Iterates over the file handle; yields key, start offset, and length.""" 493 handle = self._handle 494 handle.seek(0) 495 496 if not self._kwargs['comments']: 497 iterfunc = self._qresult_index 498 else: 499 iterfunc = self._qresult_index_commented 500 501 for key, offset, length in iterfunc(): 502 yield _bytes_to_string(key), offset, length
503
504 - def _qresult_index_commented(self):
505 """Indexer for commented BLAST tabular files.""" 506 handle = self._handle 507 handle.seek(0) 508 start_offset = 0 509 # mark of a new query 510 query_mark = None 511 # mark of the query's ID 512 qid_mark = _as_bytes('# Query: ') 513 # mark of the last line 514 end_mark = _as_bytes('# BLAST processed') 515 516 while True: 517 end_offset = handle.tell() 518 line = handle.readline() 519 520 if query_mark is None: 521 query_mark = line 522 start_offset = end_offset 523 elif line.startswith(qid_mark): 524 qresult_key = line[len(qid_mark):].split()[0] 525 elif line == query_mark or line.startswith(end_mark): 526 yield qresult_key, start_offset, end_offset - start_offset 527 start_offset = end_offset 528 elif not line: 529 break
530
531 - def _qresult_index(self):
532 """Indexer for noncommented BLAST tabular files.""" 533 handle = self._handle 534 handle.seek(0) 535 start_offset = 0 536 qresult_key = None 537 key_idx = self._key_idx 538 tab_char = _as_bytes('\t') 539 540 while True: 541 # get end offset here since we only know a qresult ends after 542 # encountering the next one 543 end_offset = handle.tell() 544 #line = handle.readline() 545 line = handle.readline() 546 547 if qresult_key is None: 548 qresult_key = line.split(tab_char)[key_idx] 549 else: 550 try: 551 curr_key = line.split(tab_char)[key_idx] 552 except IndexError: 553 curr_key = _as_bytes('') 554 555 if curr_key != qresult_key: 556 yield qresult_key, start_offset, end_offset - start_offset 557 qresult_key = curr_key 558 start_offset = end_offset 559 560 # break if we've reached EOF 561 if not line: 562 break
563
564 - def get_raw(self, offset):
565 """Returns the raw string of a QueryResult object from the given offset.""" 566 if self._kwargs['comments']: 567 getfunc = self._get_raw_qresult_commented 568 else: 569 getfunc = self._get_raw_qresult 570 571 return getfunc(offset)
572
573 - def _get_raw_qresult(self, offset):
574 """Returns the raw string of a single QueryResult from a noncommented file.""" 575 handle = self._handle 576 handle.seek(offset) 577 qresult_raw = _as_bytes('') 578 tab_char = _as_bytes('\t') 579 key_idx = self._key_idx 580 qresult_key = None 581 582 while True: 583 line = handle.readline() 584 # get the key if the first line (qresult key) 585 if qresult_key is None: 586 qresult_key = line.split(tab_char)[key_idx] 587 else: 588 try: 589 curr_key = line.split(tab_char)[key_idx] 590 except IndexError: 591 curr_key = _as_bytes('') 592 # only break when qresult is finished (key is different) 593 if curr_key != qresult_key: 594 break 595 # append to the raw string as long as qresult is the same 596 qresult_raw += line 597 598 return qresult_raw
599
600 - def _get_raw_qresult_commented(self, offset):
601 """Returns the raw string of a single QueryResult from a commented file.""" 602 handle = self._handle 603 handle.seek(offset) 604 qresult_raw = _as_bytes('') 605 end_mark = _as_bytes('# BLAST processed') 606 607 # query mark is the line marking a new query 608 # something like '# TBLASTN 2.2.25+' 609 query_mark = None 610 line = handle.readline() 611 while line: 612 # since query_mark depends on the BLAST search, we need to obtain it 613 # first 614 if query_mark is None: 615 query_mark = line 616 # break when we've reached the next qresult or the search ends 617 elif line == query_mark or line.startswith(end_mark): 618 break 619 620 qresult_raw += line 621 line = handle.readline() 622 623 return qresult_raw
624 625
626 -class BlastTabWriter(object):
627 628 """Writer for blast-tab output format.""" 629
630 - def __init__(self, handle, comments=False, fields=_DEFAULT_FIELDS):
631 self.handle = handle 632 self.has_comments = comments 633 self.fields = fields
634
635 - def write_file(self, qresults):
636 """Writes to the handle, returns how many QueryResult objects are written.""" 637 handle = self.handle 638 qresult_counter, hit_counter, hsp_counter, frag_counter = 0, 0, 0, 0 639 640 for qresult in qresults: 641 if self.has_comments: 642 handle.write(self._build_comments(qresult)) 643 if qresult: 644 handle.write(self._build_rows(qresult)) 645 if not self.has_comments: 646 qresult_counter += 1 647 hit_counter += len(qresult) 648 hsp_counter += sum([len(hit) for hit in qresult]) 649 frag_counter += sum([len(hit.fragments) for hit in qresult]) 650 # if it's commented and there are no hits in the qresult, we still 651 # increment the counter 652 if self.has_comments: 653 qresult_counter += 1 654 655 # commented files have a line saying how many queries were processed 656 if self.has_comments: 657 handle.write('# BLAST processed %i queries' % qresult_counter) 658 659 return qresult_counter, hit_counter, hsp_counter, frag_counter
660
661 - def _build_rows(self, qresult):
662 """Returns a string containing tabular rows of the QueryResult object.""" 663 coordinates = set(['qstart', 'qend', 'sstart', 'send']) 664 qresult_lines = '' 665 for hit in qresult: 666 for hsp in hit: 667 line = [] 668 for field in self.fields: 669 # get the column value ~ could either be an attribute 670 # of qresult, hit, or hsp 671 if field in _COLUMN_QRESULT: 672 value = getattr(qresult, _COLUMN_QRESULT[field][0]) 673 elif field in _COLUMN_HIT: 674 value = getattr(hit, _COLUMN_HIT[field][0]) 675 # special case, since 'frames' can be determined from 676 # query frame and hit frame 677 elif field == 'frames': 678 value = '%i/%i' % (hsp.query_frame, hsp.hit_frame) 679 elif field in _COLUMN_HSP: 680 try: 681 value = getattr(hsp, _COLUMN_HSP[field][0]) 682 except AttributeError: 683 attr = _COLUMN_HSP[field][0] 684 _augment_blast_hsp(hsp, attr) 685 value = getattr(hsp, attr) 686 elif field in _COLUMN_FRAG: 687 value = getattr(hsp, _COLUMN_FRAG[field][0]) 688 else: 689 assert field not in _SUPPORTED_FIELDS 690 continue 691 692 # adjust from and to according to strand, if from and to 693 # is included in the output field 694 if field in coordinates: 695 value = self._adjust_coords(field, value, hsp) 696 # adjust output formatting 697 value = self._adjust_output(field, value) 698 699 line.append(value) 700 701 hsp_line = '\t'.join(line) 702 qresult_lines += hsp_line + '\n' 703 704 return qresult_lines
705
706 - def _adjust_coords(self, field, value, hsp):
707 """Adjusts start and end coordinates according to strand.""" 708 assert field in ('qstart', 'qend', 'sstart', 'send') 709 # determine sequence type to operate on based on field's first letter 710 seq_type = 'query' if field.startswith('q') else 'hit' 711 712 strand = getattr(hsp, '%s_strand' % seq_type, None) 713 if strand is None: 714 raise ValueError("Required attribute %r not found." % 715 ('%s_strand' % (seq_type))) 716 # switch start <--> end coordinates if strand is -1 717 if strand < 0: 718 if field.endswith('start'): 719 value = getattr(hsp, '%s_end' % seq_type) 720 elif field.endswith('end'): 721 value = getattr(hsp, '%s_start' % seq_type) + 1 722 elif field.endswith('start'): 723 # adjust start coordinate for positive strand 724 value += 1 725 726 return value
727
728 - def _adjust_output(self, field, value):
729 """Adjusts formatting of the given field and value to mimic native tab output.""" 730 731 # evalue formatting, adapted from BLAST+ source: 732 # src/objtools/align_format/align_format_util.cpp#L668 733 if field == 'evalue': 734 if value < 1.0e-180: 735 value = '0.0' 736 elif value < 1.0e-99: 737 value = '%2.0e' % value 738 elif value < 0.0009: 739 value = '%3.0e' % value 740 elif value < 0.1: 741 value = '%4.3f' % value 742 elif value < 1.0: 743 value = '%3.2f' % value 744 elif value < 10.0: 745 value = '%2.1f' % value 746 else: 747 value = '%5.0f' % value 748 749 # pident and ppos formatting 750 elif field in ('pident', 'ppos'): 751 value = '%.2f' % value 752 753 # evalue formatting, adapted from BLAST+ source: 754 # src/objtools/align_format/align_format_util.cpp#L723 755 elif field == 'bitscore': 756 if value > 9999: 757 value = '%4.3e' % value 758 elif value > 99.9: 759 value = '%4.0d' % value 760 else: 761 value = '%4.1f' % value 762 763 # everything else 764 else: 765 value = str(value) 766 767 return value
768
769 - def _build_comments(self, qres):
770 """Returns a string of a QueryResult tabular comment.""" 771 comments = [] 772 # inverse mapping of the long-short name map, required 773 # for writing comments 774 inv_field_map = dict((value, key) for key, value in 775 _LONG_SHORT_MAP.items()) 776 777 # try to anticipate qress without version 778 if not hasattr(qres, 'version'): 779 program_line = '# %s' % qres.program.upper() 780 else: 781 program_line = '# %s %s' % (qres.program.upper(), qres.version) 782 comments.append(program_line) 783 # description may or may not be present, so we'll do a try here 784 try: 785 comments.append('# Query: %s %s' % (qres.id, qres.description)) 786 except AttributeError: 787 comments.append('# Query: %s' % qres.id) 788 # try appending RID line, if present 789 try: 790 comments.append('# RID: %s' % qres.rid) 791 except AttributeError: 792 pass 793 comments.append('# Database: %s' % qres.target) 794 # qresults without hits don't show the Fields comment 795 if qres: 796 comments.append('# Fields: %s' % 797 ', '.join([inv_field_map[field] for field in self.fields])) 798 comments.append('# %i hits found' % len(qres)) 799 800 return '\n'.join(comments) + '\n'
801 802 803 # if not used as a module, run the doctest 804 if __name__ == "__main__": 805 from Bio._utils import run_doctest 806 run_doctest() 807