
Source Code for Module Bio.SearchIO.BlastIO.blast_tab

# Copyright 2012 by Wibowo Arindrarto.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Bio.SearchIO parser for BLAST+ tab output format, with or without comments."""
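
# Illustrative usage sketch (editor's note, not part of the original module).
# This parser is normally reached through the public Bio.SearchIO API using
# the 'blast-tab' format name; the 'comments' and 'fields' keyword arguments
# are forwarded to the classes defined below. The file name is a placeholder.
#
#     from Bio import SearchIO
#     for qresult in SearchIO.parse('result.tab', 'blast-tab',
#                                   fields='qseqid sseqid pident evalue'):
#         for hit in qresult:
#             print(qresult.id, hit.id, hit.hsps[0].evalue)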

import re

from Bio._py3k import _as_bytes, _bytes_to_string
from Bio._py3k import basestring

from Bio.SearchIO._index import SearchIndexer
from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment


__all__ = ['BlastTabIndexer', 'BlastTabParser', 'BlastTabWriter']

__docformat__ = "restructuredtext en"

# longname-shortname map
# maps the column names shown in a commented output to its short name
# (the one used in the command line)
_LONG_SHORT_MAP = {
    'query id': 'qseqid',
    'query acc.': 'qacc',
    'query acc.ver': 'qaccver',
    'query length': 'qlen',
    'subject id': 'sseqid',
    'subject acc.': 'sacc',
    'subject acc.ver': 'saccver',
    'subject length': 'slen',
    'alignment length': 'length',
    'bit score': 'bitscore',
    'score': 'score',
    'evalue': 'evalue',
    'identical': 'nident',
    '% identity': 'pident',
    'positives': 'positive',
    '% positives': 'ppos',
    'mismatches': 'mismatch',
    'gaps': 'gaps',
    'q. start': 'qstart',
    'q. end': 'qend',
    's. start': 'sstart',
    's. end': 'send',
    'query frame': 'qframe',
    'sbjct frame': 'sframe',
    'query/sbjct frames': 'frames',
    'query seq': 'qseq',
    'subject seq': 'sseq',
    'gap opens': 'gapopen',
    'query gi': 'qgi',
    'subject ids': 'sallseqid',
    'subject gi': 'sgi',
    'subject gis': 'sallgi',
    'BTOP': 'btop',
    'subject accs.': 'sallacc',
    'subject tax ids': 'staxids',
    'subject sci names': 'sscinames',
    'subject com names': 'scomnames',
    'subject blast names': 'sblastnames',
    'subject super kingdoms': 'sskingdoms',
    'subject title': 'stitle',
    'subject titles': 'salltitles',
    'subject strand': 'sstrand',
    '% subject coverage': 'qcovs',
    '% hsp coverage': 'qcovhsp',
}

# function to create a list from semicolon-delimited string
# used in BlastTabParser._parse_result_row
_list_semicol = lambda x: x.split(';')
_list_diamond = lambda x: x.split('<>')
# column to class attribute map
_COLUMN_QRESULT = {
    'qseqid': ('id', str),
    'qacc': ('accession', str),
    'qaccver': ('accession_version', str),
    'qlen': ('seq_len', int),
    'qgi': ('gi', str),
}
_COLUMN_HIT = {
    'sseqid': ('id', str),
    'sallseqid': ('id_all', _list_semicol),
    'sacc': ('accession', str),
    'saccver': ('accession_version', str),
    'sallacc': ('accession_all', _list_semicol),
    'sgi': ('gi', str),
    'sallgi': ('gi_all', str),
    'slen': ('seq_len', int),
    'staxids': ('tax_ids', _list_semicol),
    'sscinames': ('sci_names', _list_semicol),
    'scomnames': ('com_names', _list_semicol),
    'sblastnames': ('blast_names', _list_semicol),
    'sskingdoms': ('super_kingdoms', _list_semicol),
    'stitle': ('title', str),
    'salltitles': ('title_all', _list_diamond),
    # set strand as HSP property?
    'sstrand': ('strand', str),
    'qcovs': ('query_coverage', float),
}
_COLUMN_HSP = {
    'bitscore': ('bitscore', float),
    'score': ('bitscore_raw', int),
    'evalue': ('evalue', float),
    'nident': ('ident_num', int),
    'pident': ('ident_pct', float),
    'positive': ('pos_num', int),
    'ppos': ('pos_pct', float),
    'mismatch': ('mismatch_num', int),
    'gaps': ('gap_num', int),
    'gapopen': ('gapopen_num', int),
    'btop': ('btop', str),
    'qcovhsp': ('query_coverage', float),
}
_COLUMN_FRAG = {
    'length': ('aln_span', int),
    'qstart': ('query_start', int),
    'qend': ('query_end', int),
    'sstart': ('hit_start', int),
    'send': ('hit_end', int),
    'qframe': ('query_frame', int),
    'sframe': ('hit_frame', int),
    'frames': ('frames', str),
    'qseq': ('query', str),
    'sseq': ('hit', str),
}
_SUPPORTED_FIELDS = set(list(_COLUMN_QRESULT) + list(_COLUMN_HIT) +
                        list(_COLUMN_HSP) + list(_COLUMN_FRAG))

# column order in the non-commented tabular output variant
# values must be keys inside the column-attribute maps above
_DEFAULT_FIELDS = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch',
        'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
# one field from each of the following sets must exist in order for the
# parser to work
_MIN_QUERY_FIELDS = set(['qseqid', 'qacc', 'qaccver'])
_MIN_HIT_FIELDS = set(['sseqid', 'sacc', 'saccver', 'sallseqid'])
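
# Editor's illustration (not part of the original source): how a custom field
# list maps onto SearchIO object attributes through the tables above. For a
# run such as ``blastn ... -outfmt "6 qseqid sseqid pident evalue"`` the
# parser sets, per result row:
#
#     qseqid -> QueryResult.id   (str)
#     sseqid -> Hit.id           (str)
#     pident -> HSP.ident_pct    (float)
#     evalue -> HSP.evalue       (float)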

# simple function to create BLAST HSP attributes that may be computed if
# certain other attributes are present
# This was previously implemented in the HSP objects in the old model

_RE_GAPOPEN = re.compile(r'\w-')


def _compute_gapopen_num(hsp):
    """Returns the number of gap openings in the given HSP."""
    gapopen = 0
    for seq_type in ('query', 'hit'):
        seq = str(getattr(hsp, seq_type).seq)
        gapopen += len(re.findall(_RE_GAPOPEN, seq))
    return gapopen


def _augment_blast_hsp(hsp, attr):
    """Calculates the given HSP attribute, for writing."""
    if attr == 'aln_span':
        # aln_span is number of identical matches + mismatches + gaps
        func = lambda hsp: hsp.ident_num + hsp.mismatch_num + hsp.gap_num
    # ident and gap will require the num values to be computed first
    elif attr.startswith('ident'):
        func = lambda hsp: hsp.aln_span - hsp.mismatch_num - hsp.gap_num
    elif attr.startswith('gap'):
        func = lambda hsp: hsp.aln_span - hsp.ident_num - hsp.mismatch_num
    elif attr == 'mismatch_num':
        func = lambda hsp: hsp.aln_span - hsp.ident_num - hsp.gap_num
    elif attr == 'gapopen_num':
        if not hasattr(hsp, 'query') or not hasattr(hsp, 'hit'):
            # mock function so that the except clause below is triggered
            # as both the query and hit are required to compute gapopen
            def mock(hsp):
                raise AttributeError
            func = mock
        else:
            func = _compute_gapopen_num

    # set the num values
    # requires the endswith check, since we only want to set 'num' or 'span'
    # attributes here
    if not hasattr(hsp, attr) and not attr.endswith('_pct'):
        value = func(hsp)
        setattr(hsp, attr, value)

    # if the attr is a percent value, calculate it
    if attr == 'ident_pct':
        func2 = lambda hsp: hsp.ident_num / float(hsp.aln_span) * 100
    elif attr == 'pos_pct':
        func2 = lambda hsp: hsp.pos_num / float(hsp.aln_span) * 100
    elif attr == 'gap_pct':
        func2 = lambda hsp: hsp.gap_num / float(hsp.aln_span) * 100
    else:
        func2 = None

    # set the pct values
    if func2 is not None:
        value = func2(hsp)
        setattr(hsp, attr, value)


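# Editor's worked example (not part of the original source): for an HSP that
# already has ident_num=88, mismatch_num=10 and gap_num=2,
# _augment_blast_hsp(hsp, 'aln_span') sets aln_span to 100, and a subsequent
# _augment_blast_hsp(hsp, 'ident_pct') sets ident_pct to 88.0
# (88 / 100.0 * 100).
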
class BlastTabParser(object):

    """Parser for the BLAST tabular format."""

    def __init__(self, handle, comments=False, fields=_DEFAULT_FIELDS):
        self.handle = handle
        self.has_comments = comments
        self.fields = self._prep_fields(fields)
        self.line = self.handle.readline().strip()

    def __iter__(self):
        # stop iteration if file has no lines
        if not self.line:
            raise StopIteration
        # determine which iterator to use
        elif self.has_comments:
            iterfunc = self._parse_commented_qresult
        else:
            iterfunc = self._parse_qresult

        for qresult in iterfunc():
            yield qresult

    def _prep_fields(self, fields):
        """Validates and formats the given fields for use by the parser."""
        # cast into list if fields is a space-separated string
        if isinstance(fields, basestring):
            fields = fields.strip().split(' ')
        # blast allows 'std' as a proxy for the standard default lists
        # we want to transform 'std' to its proper column names
        if 'std' in fields:
            idx = fields.index('std')
            fields = fields[:idx] + _DEFAULT_FIELDS + fields[idx + 1:]
        # if set(fields) has a null intersection with minimum required
        # fields for hit and query, raise an exception
        if not set(fields).intersection(_MIN_QUERY_FIELDS) or \
                not set(fields).intersection(_MIN_HIT_FIELDS):
            raise ValueError("Required query and/or hit ID field not found.")

        return fields

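    # Editor's illustration (not part of the original source): _prep_fields
    # accepts either a list or a space-separated string, and expands the
    # 'std' shorthand into the twelve default columns, e.g.
    #
    #     _prep_fields('qgi std sstrand')
    #     == ['qgi'] + _DEFAULT_FIELDS + ['sstrand']
    #
    # A field set without any query or hit ID column raises ValueError.
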
    def _parse_commented_qresult(self):
        """Iterator returning `QueryResult` objects from a commented file."""
        while True:
            comments = self._parse_comments()
            if comments:
                try:
                    self.fields = comments['fields']
                    # iterator for the query results
                    qres_iter = self._parse_qresult()
                except KeyError:
                    # no fields means the query has no results
                    assert 'fields' not in comments
                    # create an iterator returning one empty qresult
                    # if the query has no results
                    qres_iter = iter([QueryResult()])

                for qresult in qres_iter:
                    for key, value in comments.items():
                        setattr(qresult, key, value)
                    yield qresult

            else:
                break

    def _parse_comments(self):
        """Returns a dictionary containing tab file comments."""
        comments = {}
        while True:
            # parse program and version
            # example: # BLASTX 2.2.26+
            if 'BLAST' in self.line and 'processed' not in self.line:
                program_line = self.line[len(' #'):].split(' ')
                comments['program'] = program_line[0].lower()
                comments['version'] = program_line[1]
            # parse query id and description (if available)
            # example: # Query: gi|356995852 Mus musculus POU domain
            elif 'Query' in self.line:
                query_line = self.line[len('# Query: '):].split(' ', 1)
                comments['id'] = query_line[0]
                if len(query_line) == 2:
                    comments['description'] = query_line[1]
            # parse target database
            # example: # Database: db/minirefseq_protein
            elif 'Database' in self.line:
                comments['target'] = self.line[len('# Database: '):]
            # parse RID (from remote searches)
            elif 'RID' in self.line:
                comments['rid'] = self.line[len('# RID: '):]
            # parse column order, required for parsing the result lines
            # example: # Fields: query id, query gi, query acc., query length
            elif 'Fields' in self.line:
                comments['fields'] = self._parse_fields_line()
            # if the line has these strings, it's either the end of a comment
            # or the end of a file, so we return all the comments we've parsed
            elif ' hits found' in self.line or 'processed' in self.line:
                self.line = self.handle.readline().strip()
                return comments

            self.line = self.handle.readline()

            if not self.line:
                return comments
            else:
                self.line = self.line.strip()

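    # Editor's illustration (not part of the original source): for a header
    # block such as
    #
    #     # BLASTX 2.2.26+
    #     # Query: gi|356995852 Mus musculus POU domain
    #     # Database: db/minirefseq_protein
    #     # Fields: query id, subject id, evalue
    #     # 5 hits found
    #
    # _parse_comments returns roughly
    # {'program': 'blastx', 'version': '2.2.26+', 'id': 'gi|356995852',
    #  'description': 'Mus musculus POU domain',
    #  'target': 'db/minirefseq_protein',
    #  'fields': ['qseqid', 'sseqid', 'evalue']}
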
    def _parse_fields_line(self):
        """Returns a list of column short names from the 'Fields'
        comment line."""
        raw_field_str = self.line[len('# Fields: '):]
        long_fields = raw_field_str.split(', ')
        fields = [_LONG_SHORT_MAP[long_name] for long_name in long_fields]
        return self._prep_fields(fields)

    def _parse_result_row(self):
        """Returns a dictionary of parsed row values."""
        fields = self.fields
        columns = self.line.strip().split('\t')
        assert len(fields) == len(columns), "Expected %i columns, found: " \
            "%i" % (len(fields), len(columns))

        qresult, hit, hsp, frag = {}, {}, {}, {}
        for idx, value in enumerate(columns):
            sname = fields[idx]
            # flag to check if any of the _COLUMNs contain sname
            in_mapping = False
            # iterate over each dict, mapping pair to determine
            # attribute name and value of each column
            for parsed_dict, mapping in (
                    (qresult, _COLUMN_QRESULT),
                    (hit, _COLUMN_HIT),
                    (hsp, _COLUMN_HSP),
                    (frag, _COLUMN_FRAG)):
                # process parsed value according to mapping
                if sname in mapping:
                    attr_name, caster = mapping[sname]
                    if caster is not str:
                        value = caster(value)
                    parsed_dict[attr_name] = value
                    in_mapping = True
            # make sure that any unhandled field is not supported
            if not in_mapping:
                assert sname not in _SUPPORTED_FIELDS

        return {'qresult': qresult, 'hit': hit, 'hsp': hsp, 'frag': frag}

    def _get_id(self, parsed):
        """Returns the value used for a QueryResult or Hit ID from a parsed row."""
        # use 'id', with 'id_all', 'accession' and 'accession_version'
        # fallbacks; one of these must have a value since we've checked
        # whether they exist when parsing the comments
        id_cache = parsed.get('id')
        if id_cache is None and 'id_all' in parsed:
            id_cache = parsed.get('id_all')[0]
        if id_cache is None:
            id_cache = parsed.get('accession')
        if id_cache is None:
            id_cache = parsed.get('accession_version')

        return id_cache

    def _parse_qresult(self):
        """Generator function that returns QueryResult objects."""
        # state values, used to determine what to do with each line
        state_EOF = 0
        state_QRES_NEW = 1
        state_QRES_SAME = 3
        state_HIT_NEW = 2
        state_HIT_SAME = 4
        # dummies for initial states
        qres_state = None
        hit_state = None
        file_state = None
        cur_qid = None
        cur_hid = None
        # dummies for initial id caches
        prev_qid = None
        prev_hid = None
        # dummies for initial parsed value containers
        cur, prev = None, None
        hit_list, hsp_list = [], []

        while True:
            # store previous line's parsed values if we've passed the first line
            if cur is not None:
                prev = cur
                prev_qid = cur_qid
                prev_hid = cur_hid
            # only parse the line if it's not EOF and not a comment line
            if self.line and not self.line.startswith('#'):
                cur = self._parse_result_row()
                cur_qid = self._get_id(cur['qresult'])
                cur_hid = self._get_id(cur['hit'])
            else:
                file_state = state_EOF
                # mock values for cur_qid and cur_hid since the line is empty
                cur_qid, cur_hid = None, None

            # get the state of hit and qresult
            if prev_qid != cur_qid:
                qres_state = state_QRES_NEW
            else:
                qres_state = state_QRES_SAME
            # new hits are hits with different id or hits in a new qresult
            if prev_hid != cur_hid or qres_state == state_QRES_NEW:
                hit_state = state_HIT_NEW
            else:
                hit_state = state_HIT_SAME

            # we're creating objects for the previously parsed line(s),
            # so nothing is done in the first parsed line (prev == None)
            if prev is not None:
                # every line is essentially an HSP with one fragment, so we
                # create both of these for every line
                frag = HSPFragment(prev_hid, prev_qid)
                for attr, value in prev['frag'].items():
                    # adjust coordinates to Python range
                    # NOTE: this requires both start and end coords to be
                    # present, otherwise a KeyError will be raised.
                    # Without this limitation, we might misleadingly set the
                    # start / end coords
                    for seq_type in ('query', 'hit'):
                        if attr == seq_type + '_start':
                            value = min(value,
                                        prev['frag'][seq_type + '_end']) - 1
                        elif attr == seq_type + '_end':
                            value = max(value,
                                        prev['frag'][seq_type + '_start'])
                    setattr(frag, attr, value)
                # strand and frame setattr require the full parsed values
                # to be set first
                for seq_type in ('hit', 'query'):
                    # try to set hit and query frame
                    frame = self._get_frag_frame(frag, seq_type,
                                                 prev['frag'])
                    setattr(frag, '%s_frame' % seq_type, frame)
                    # try to set hit and query strand
                    strand = self._get_frag_strand(frag, seq_type,
                                                   prev['frag'])
                    setattr(frag, '%s_strand' % seq_type, strand)

                hsp = HSP([frag])
                for attr, value in prev['hsp'].items():
                    setattr(hsp, attr, value)
                hsp_list.append(hsp)

                # create hit and append to temp hit container if hit_state
                # says we're not at the same hit or at a new query
                if hit_state == state_HIT_NEW:
                    hit = Hit(hsp_list)
                    for attr, value in prev['hit'].items():
                        if attr != 'id_all':
                            setattr(hit, attr, value)
                        else:
                            # not setting hit ID since it's already set from
                            # the prev_hid above
                            setattr(hit, '_id_alt', value[1:])
                    hit_list.append(hit)
                    hsp_list = []
                # create qresult and yield if we're at a new qresult or EOF
                if qres_state == state_QRES_NEW or file_state == state_EOF:
                    qresult = QueryResult(hit_list, prev_qid)
                    for attr, value in prev['qresult'].items():
                        setattr(qresult, attr, value)
                    yield qresult
                    # if current line is EOF, break
                    if file_state == state_EOF:
                        break
                    hit_list = []

            self.line = self.handle.readline().strip()

    def _get_frag_frame(self, frag, seq_type, parsedict):
        """Returns `HSPFragment` frame given the object, its sequence type,
        and its parsed dictionary values."""
        assert seq_type in ('query', 'hit')
        frame = getattr(frag, '%s_frame' % seq_type, None)
        if frame is not None:
            return frame
        else:
            if 'frames' in parsedict:
                # frames is 'x1/x2' string, x1 is query frame, x2 is hit frame
                idx = 0 if seq_type == 'query' else 1
                return int(parsedict['frames'].split('/')[idx])
            # else implicit None return

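    # Editor's note (not part of the original source): when only the combined
    # 'frames' column is present, a value such as '1/-2' yields query_frame 1
    # and hit_frame -2; with neither column available the frame stays None.
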
    def _get_frag_strand(self, frag, seq_type, parsedict):
        """Returns `HSPFragment` strand given the object, its sequence type,
        and its parsed dictionary values."""
        # NOTE: this will never set the strands as 0 for protein
        # queries / hits, since we can't detect the blast flavors
        # from the columns alone.
        assert seq_type in ('query', 'hit')
        strand = getattr(frag, '%s_strand' % seq_type, None)
        if strand is not None:
            return strand
        else:
            # using parsedict instead of the fragment object since
            # we need the unadjusted coordinate values
            start = parsedict.get('%s_start' % seq_type)
            end = parsedict.get('%s_end' % seq_type)
            if start is not None and end is not None:
                return 1 if start <= end else -1
            # else implicit None return


class BlastTabIndexer(SearchIndexer):

    """Indexer class for BLAST+ tab output."""

    _parser = BlastTabParser

    def __init__(self, filename, comments=False, fields=_DEFAULT_FIELDS):
        SearchIndexer.__init__(self, filename, comments=comments, fields=fields)

        # if the file doesn't have comments,
        # get index of column used as the key (qseqid / qacc / qaccver)
        if not self._kwargs['comments']:
            if 'qseqid' in fields:
                self._key_idx = fields.index('qseqid')
            elif 'qacc' in fields:
                self._key_idx = fields.index('qacc')
            elif 'qaccver' in fields:
                self._key_idx = fields.index('qaccver')
            else:
                raise ValueError("Custom fields is missing an ID column. "
                                 "One of these must be present: 'qseqid', 'qacc', or 'qaccver'.")

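    # Editor's illustration (not part of the original source): this indexer is
    # normally used through the public API, which builds a dict-like object of
    # offsets keyed by query ID; the file name below is a placeholder.
    #
    #     from Bio import SearchIO
    #     idx = SearchIO.index('result.tab', 'blast-tab', comments=True)
    #     qresult = idx['gi|356995852']
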
    def __iter__(self):
        """Iterates over the file handle; yields key, start offset, and length."""
        handle = self._handle
        handle.seek(0)

        if not self._kwargs['comments']:
            iterfunc = self._qresult_index
        else:
            iterfunc = self._qresult_index_commented

        for key, offset, length in iterfunc():
            yield _bytes_to_string(key), offset, length

    def _qresult_index_commented(self):
        """Indexer for commented BLAST tabular files."""
        handle = self._handle
        handle.seek(0)
        start_offset = 0
        # mark of a new query
        query_mark = None
        # mark of the query's ID
        qid_mark = _as_bytes('# Query: ')
        # mark of the last line
        end_mark = _as_bytes('# BLAST processed')

        while True:
            end_offset = handle.tell()
            line = handle.readline()

            if query_mark is None:
                query_mark = line
                start_offset = end_offset
            elif line.startswith(qid_mark):
                qresult_key = line[len(qid_mark):].split()[0]
            elif line == query_mark or line.startswith(end_mark):
                yield qresult_key, start_offset, end_offset - start_offset
                start_offset = end_offset
            elif not line:
                break

    def _qresult_index(self):
        """Indexer for noncommented BLAST tabular files."""
        handle = self._handle
        handle.seek(0)
        start_offset = 0
        qresult_key = None
        key_idx = self._key_idx
        tab_char = _as_bytes('\t')

        while True:
            # get end offset here since we only know a qresult ends after
            # encountering the next one
            end_offset = handle.tell()
            line = handle.readline()

            if qresult_key is None:
                qresult_key = line.split(tab_char)[key_idx]
            else:
                try:
                    curr_key = line.split(tab_char)[key_idx]
                except IndexError:
                    curr_key = _as_bytes('')

                if curr_key != qresult_key:
                    yield qresult_key, start_offset, end_offset - start_offset
                    qresult_key = curr_key
                    start_offset = end_offset

            # break if we've reached EOF
            if not line:
                break

    def get_raw(self, offset):
        """Returns the raw string of a QueryResult object from the given offset."""
        if self._kwargs['comments']:
            getfunc = self._get_raw_qresult_commented
        else:
            getfunc = self._get_raw_qresult

        return getfunc(offset)

    def _get_raw_qresult(self, offset):
        """Returns the raw string of a single QueryResult from a noncommented file."""
        handle = self._handle
        handle.seek(offset)
        qresult_raw = _as_bytes('')
        tab_char = _as_bytes('\t')
        key_idx = self._key_idx
        qresult_key = None

        while True:
            line = handle.readline()
            # get the key of the first line (qresult key)
            if qresult_key is None:
                qresult_key = line.split(tab_char)[key_idx]
            else:
                try:
                    curr_key = line.split(tab_char)[key_idx]
                except IndexError:
                    curr_key = _as_bytes('')
                # only break when qresult is finished (key is different)
                if curr_key != qresult_key:
                    break
            # append to the raw string as long as qresult is the same
            qresult_raw += line

        return qresult_raw

    def _get_raw_qresult_commented(self, offset):
        """Returns the raw string of a single QueryResult from a commented file."""
        handle = self._handle
        handle.seek(offset)
        qresult_raw = _as_bytes('')
        end_mark = _as_bytes('# BLAST processed')

        # query mark is the line marking a new query
        # something like '# TBLASTN 2.2.25+'
        query_mark = None
        line = handle.readline()
        while line:
            # since query_mark depends on the BLAST search, we need to obtain it
            # first
            if query_mark is None:
                query_mark = line
            # break when we've reached the next qresult or the search ends
            elif line == query_mark or line.startswith(end_mark):
                break

            qresult_raw += line
            line = handle.readline()

        return qresult_raw


class BlastTabWriter(object):

    """Writer for blast-tab output format."""

    def __init__(self, handle, comments=False, fields=_DEFAULT_FIELDS):
        self.handle = handle
        self.has_comments = comments
        self.fields = fields

    def write_file(self, qresults):
        """Writes to the handle, returns how many QueryResult objects are written."""
        handle = self.handle
        qresult_counter, hit_counter, hsp_counter, frag_counter = 0, 0, 0, 0

        for qresult in qresults:
            if self.has_comments:
                handle.write(self._build_comments(qresult))
            if qresult:
                handle.write(self._build_rows(qresult))
                if not self.has_comments:
                    qresult_counter += 1
                hit_counter += len(qresult)
                hsp_counter += sum(len(hit) for hit in qresult)
                frag_counter += sum(len(hit.fragments) for hit in qresult)
            # if it's commented and there are no hits in the qresult, we still
            # increment the counter
            if self.has_comments:
                qresult_counter += 1

        # commented files have a line saying how many queries were processed
        if self.has_comments:
            handle.write('# BLAST processed %i queries' % qresult_counter)

        return qresult_counter, hit_counter, hsp_counter, frag_counter

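    # Editor's illustration (not part of the original source): the writer is
    # normally reached through the public API; the return value is the
    # (qresult, hit, hsp, fragment) count tuple built above. File names are
    # placeholders.
    #
    #     from Bio import SearchIO
    #     qresults = SearchIO.parse('result.tab', 'blast-tab', comments=True)
    #     counts = SearchIO.write(qresults, 'result_out.tab', 'blast-tab')
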
    def _build_rows(self, qresult):
        """Returns a string containing tabular rows of the QueryResult object."""
        coordinates = set(['qstart', 'qend', 'sstart', 'send'])
        qresult_lines = ''
        for hit in qresult:
            for hsp in hit:
                line = []
                for field in self.fields:
                    # get the column value ~ could either be an attribute
                    # of qresult, hit, or hsp
                    if field in _COLUMN_QRESULT:
                        value = getattr(qresult, _COLUMN_QRESULT[field][0])
                    elif field in _COLUMN_HIT:
                        if field == 'sallseqid':
                            value = getattr(hit, 'id_all')
                        else:
                            value = getattr(hit, _COLUMN_HIT[field][0])
                    # special case, since 'frames' can be determined from
                    # query frame and hit frame
                    elif field == 'frames':
                        value = '%i/%i' % (hsp.query_frame, hsp.hit_frame)
                    elif field in _COLUMN_HSP:
                        try:
                            value = getattr(hsp, _COLUMN_HSP[field][0])
                        except AttributeError:
                            attr = _COLUMN_HSP[field][0]
                            _augment_blast_hsp(hsp, attr)
                            value = getattr(hsp, attr)
                    elif field in _COLUMN_FRAG:
                        value = getattr(hsp, _COLUMN_FRAG[field][0])
                    else:
                        assert field not in _SUPPORTED_FIELDS
                        continue

                    # adjust from and to according to strand, if from and to
                    # are included in the output fields
                    if field in coordinates:
                        value = self._adjust_coords(field, value, hsp)
                    # adjust output formatting
                    value = self._adjust_output(field, value)

                    line.append(value)

                hsp_line = '\t'.join(line)
                qresult_lines += hsp_line + '\n'

        return qresult_lines

    def _adjust_coords(self, field, value, hsp):
        """Adjusts start and end coordinates according to strand."""
        assert field in ('qstart', 'qend', 'sstart', 'send')
        # determine sequence type to operate on based on field's first letter
        seq_type = 'query' if field.startswith('q') else 'hit'

        strand = getattr(hsp, '%s_strand' % seq_type, None)
        if strand is None:
            raise ValueError("Required attribute %r not found." %
                             ('%s_strand' % (seq_type)))
        # switch start <--> end coordinates if strand is -1
        if strand < 0:
            if field.endswith('start'):
                value = getattr(hsp, '%s_end' % seq_type)
            elif field.endswith('end'):
                value = getattr(hsp, '%s_start' % seq_type) + 1
        elif field.endswith('start'):
            # adjust start coordinate for positive strand
            value += 1

        return value

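    # Editor's illustration (not part of the original source): SearchIO stores
    # coordinates as 0-based, end-exclusive ranges with start < end, while
    # BLAST tab output is 1-based and reports start > end on the minus strand.
    # For a minus-strand hit stored as hit_start=9, hit_end=100, the written
    # 'sstart' and 'send' columns become 100 and 10, respectively.
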
    def _adjust_output(self, field, value):
        """Adjusts formatting of the given field and value to mimic native tab output."""
        # qseq and sseq are stored as SeqRecord, but here we only need the str
        if field in ('qseq', 'sseq'):
            value = str(value.seq)

        # evalue formatting, adapted from BLAST+ source:
        # src/objtools/align_format/align_format_util.cpp#L668
        elif field == 'evalue':
            if value < 1.0e-180:
                value = '0.0'
            elif value < 1.0e-99:
                value = '%2.0e' % value
            elif value < 0.0009:
                value = '%3.0e' % value
            elif value < 0.1:
                value = '%4.3f' % value
            elif value < 1.0:
                value = '%3.2f' % value
            elif value < 10.0:
                value = '%2.1f' % value
            else:
                value = '%5.0f' % value

        # pident and ppos formatting
        elif field in ('pident', 'ppos'):
            value = '%.2f' % value

        # bitscore formatting, adapted from BLAST+ source:
        # src/objtools/align_format/align_format_util.cpp#L723
        elif field == 'bitscore':
            if value > 9999:
                value = '%4.3e' % value
            elif value > 99.9:
                value = '%4.0d' % value
            else:
                value = '%4.1f' % value

        # coverages have no comma (using floats still ~ a more proper
        # representation)
        elif field in ('qcovhsp', 'qcovs'):
            value = '%.0f' % value

        # list into '<>'-delimited string
        elif field == 'salltitles':
            value = '<>'.join(value)

        # list into ';'-delimited string
        elif field in ('sallseqid', 'sallacc', 'staxids', 'sscinames',
                       'scomnames', 'sblastnames', 'sskingdoms'):
            value = ';'.join(value)

        # everything else
        else:
            value = str(value)

        return value

    def _build_comments(self, qres):
        """Returns a string of a QueryResult tabular comment."""
        comments = []
        # inverse mapping of the long-short name map, required
        # for writing comments
        inv_field_map = dict((v, k) for k, v in _LONG_SHORT_MAP.items())

        # try to anticipate qresults without version
        if not hasattr(qres, 'version'):
            program_line = '# %s' % qres.program.upper()
        else:
            program_line = '# %s %s' % (qres.program.upper(), qres.version)
        comments.append(program_line)
        # description may or may not be None
        if qres.description is None:
            comments.append('# Query: %s' % qres.id)
        else:
            comments.append('# Query: %s %s' % (qres.id, qres.description))
        # try appending RID line, if present
        try:
            comments.append('# RID: %s' % qres.rid)
        except AttributeError:
            pass
        comments.append('# Database: %s' % qres.target)
        # qresults without hits don't show the Fields comment
        if qres:
            comments.append('# Fields: %s' %
                            ', '.join(inv_field_map[field] for field in self.fields))
        comments.append('# %i hits found' % len(qres))

        return '\n'.join(comments) + '\n'


# if not used as a module, run the doctest
if __name__ == "__main__":
    from Bio._utils import run_doctest
    run_doctest()