Package Bio :: Package SearchIO :: Package BlastIO :: Module blast_tab
[hide private]
[frames] | no frames]

Source Code for Module Bio.SearchIO.BlastIO.blast_tab

  1  # Copyright 2012 by Wibowo Arindrarto.  All rights reserved. 
  2  # 
  3  # This file is part of the Biopython distribution and governed by your 
  4  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
  5  # Please see the LICENSE file that should have been included as part of this 
  6  # package. 
  7   
  8  """Bio.SearchIO parser for BLAST+ tab output format, with or without comments.""" 
  9   
 10  import re 
 11   
 12  from Bio._py3k import _as_bytes, _bytes_to_string 
 13  from Bio._py3k import basestring 
 14   
 15  from Bio.SearchIO._index import SearchIndexer 
 16  from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment 
 17   
 18   
 19  __all__ = ('BlastTabIndexer', 'BlastTabParser', 'BlastTabWriter') 
 20   
 21   
 22  # longname-shortname map 
 23  # maps the column names shown in a commented output to its short name 
 24  # (the one used in the command line) 
 25  _LONG_SHORT_MAP = { 
 26      'query id': 'qseqid', 
 27      'query acc.': 'qacc', 
 28      'query acc.ver': 'qaccver', 
 29      'query length': 'qlen', 
 30      'subject id': 'sseqid', 
 31      'subject acc.': 'sacc', 
 32      'subject acc.ver': 'saccver', 
 33      'subject length': 'slen', 
 34      'alignment length': 'length', 
 35      'bit score': 'bitscore', 
 36      'score': 'score', 
 37      'evalue': 'evalue', 
 38      'identical': 'nident', 
 39      '% identity': 'pident', 
 40      'positives': 'positive', 
 41      '% positives': 'ppos', 
 42      'mismatches': 'mismatch', 
 43      'gaps': 'gaps', 
 44      'q. start': 'qstart', 
 45      'q. end': 'qend', 
 46      's. start': 'sstart', 
 47      's. end': 'send', 
 48      'query frame': 'qframe', 
 49      'sbjct frame': 'sframe', 
 50      'query/sbjct frames': 'frames', 
 51      'query seq': 'qseq', 
 52      'subject seq': 'sseq', 
 53      'gap opens': 'gapopen', 
 54      'query gi': 'qgi', 
 55      'subject ids': 'sallseqid', 
 56      'subject gi': 'sgi', 
 57      'subject gis': 'sallgi', 
 58      'BTOP': 'btop', 
 59      'subject accs.': 'sallacc', 
 60      'subject tax ids': 'staxids', 
 61      'subject sci names': 'sscinames', 
 62      'subject com names': 'scomnames', 
 63      'subject blast names': 'sblastnames', 
 64      'subject super kingdoms': 'sskingdoms', 
 65      'subject title': 'stitle', 
 66      'subject titles': 'salltitles', 
 67      'subject strand': 'sstrand', 
 68      '% subject coverage': 'qcovs', 
 69      '% hsp coverage': 'qcovhsp', 
 70  } 
 71   
 72  # function to create a list from semicolon-delimited string 
 73  # used in BlastTabParser._parse_result_row 
 74  _list_semicol = lambda x: x.split(';') 
 75  _list_diamond = lambda x: x.split('<>') 
 76  # column to class attribute map 
 77  _COLUMN_QRESULT = { 
 78      'qseqid': ('id', str), 
 79      'qacc': ('accession', str), 
 80      'qaccver': ('accession_version', str), 
 81      'qlen': ('seq_len', int), 
 82      'qgi': ('gi', str), 
 83  } 
 84  _COLUMN_HIT = { 
 85      'sseqid': ('id', str), 
 86      'sallseqid': ('id_all', _list_semicol), 
 87      'sacc': ('accession', str), 
 88      'saccver': ('accession_version', str), 
 89      'sallacc': ('accession_all', _list_semicol), 
 90      'sgi': ('gi', str), 
 91      'sallgi': ('gi_all', str), 
 92      'slen': ('seq_len', int), 
 93      'staxids': ('tax_ids', _list_semicol), 
 94      'sscinames': ('sci_names', _list_semicol), 
 95      'scomnames': ('com_names', _list_semicol), 
 96      'sblastnames': ('blast_names', _list_semicol), 
 97      'sskingdoms': ('super_kingdoms', _list_semicol), 
 98      'stitle': ('title', str), 
 99      'salltitles': ('title_all', _list_diamond), 
100      # set strand as HSP property? 
101      'sstrand': ('strand', str), 
102      'qcovs': ('query_coverage', float), 
103  } 
104  _COLUMN_HSP = { 
105      'bitscore': ('bitscore', float), 
106      'score': ('bitscore_raw', int), 
107      'evalue': ('evalue', float), 
108      'nident': ('ident_num', int), 
109      'pident': ('ident_pct', float), 
110      'positive': ('pos_num', int), 
111      'ppos': ('pos_pct', float), 
112      'mismatch': ('mismatch_num', int), 
113      'gaps': ('gap_num', int), 
114      'gapopen': ('gapopen_num', int), 
115      'btop': ('btop', str), 
116      'qcovhsp': ('query_coverage', float), 
117  } 
118  _COLUMN_FRAG = { 
119      'length': ('aln_span', int), 
120      'qstart': ('query_start', int), 
121      'qend': ('query_end', int), 
122      'sstart': ('hit_start', int), 
123      'send': ('hit_end', int), 
124      'qframe': ('query_frame', int), 
125      'sframe': ('hit_frame', int), 
126      'frames': ('frames', str), 
127      'qseq': ('query', str), 
128      'sseq': ('hit', str), 
129  } 
130  _SUPPORTED_FIELDS = set(list(_COLUMN_QRESULT) + list(_COLUMN_HIT) + 
131                          list(_COLUMN_HSP) + list(_COLUMN_FRAG)) 
132   
133  # column order in the non-commented tabular output variant 
134  # values must be keys inside the column-attribute maps above 
135  _DEFAULT_FIELDS = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 
136          'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'] 
137  # one field from each of the following sets must exist in order for the 
138  # parser to work 
139  _MIN_QUERY_FIELDS = set(['qseqid', 'qacc', 'qaccver']) 
140  _MIN_HIT_FIELDS = set(['sseqid', 'sacc', 'saccver', 'sallseqid']) 
141   
142  # simple function to create BLAST HSP attributes that may be computed if 
143  # other certain attributes are present 
144  # This was previously implemented in the HSP objects in the old model 
145   
146  _RE_GAPOPEN = re.compile(r'\w-') 
147   
148   
149 -def _compute_gapopen_num(hsp):
150 """Return the number of gap openings in the given HSP (PRIVATE).""" 151 gapopen = 0 152 for seq_type in ('query', 'hit'): 153 seq = str(getattr(hsp, seq_type).seq) 154 gapopen += len(re.findall(_RE_GAPOPEN, seq)) 155 return gapopen
156 157
158 -def _augment_blast_hsp(hsp, attr):
159 """Calculate the given HSP attribute, for writing (PRIVATE).""" 160 if attr == 'aln_span': 161 # aln_span is number of identical matches + mismatches + gaps 162 func = lambda hsp: hsp.ident_num + hsp.mismatch_num + hsp.gap_num 163 # ident and gap will require the num values to be computed first 164 elif attr.startswith('ident'): 165 func = lambda hsp: hsp.aln_span - hsp.mismatch_num - hsp.gap_num 166 elif attr.startswith('gap'): 167 func = lambda hsp: hsp.aln_span - hsp.ident_num - hsp.mismatch_num 168 elif attr == 'mismatch_num': 169 func = lambda hsp: hsp.aln_span - hsp.ident_num - hsp.gap_num 170 elif attr == 'gapopen_num': 171 if not hasattr(hsp, 'query') or not hasattr(hsp, 'hit'): 172 # mock function so that the except clause below is triggered 173 # as both the query and hit are required to compute gapopen 174 def mock(hsp): 175 raise AttributeError
176 func = mock 177 else: 178 func = _compute_gapopen_num 179 180 # set the num values 181 # requires the endswith check, since we only want to set 'num' or 'span' 182 # attributes here 183 if not hasattr(hsp, attr) and not attr.endswith('_pct'): 184 value = func(hsp) 185 setattr(hsp, attr, value) 186 187 # if the attr is a percent value, calculate it 188 if attr == 'ident_pct': 189 func2 = lambda hsp: hsp.ident_num / float(hsp.aln_span) * 100 190 elif attr == 'pos_pct': 191 func = lambda hsp: hsp.pos_num / float(hsp.aln_span) * 100 192 elif attr == 'gap_pct': 193 func2 = lambda hsp: hsp.gap_num / float(hsp.aln_span) * 100 194 else: 195 func2 = None 196 197 # set the pct values 198 if func2 is not None: 199 value = func2(hsp) 200 setattr(hsp, attr, value) 201 202
203 -class BlastTabParser(object):
204 """Parser for the BLAST tabular format.""" 205
206 - def __init__(self, handle, comments=False, fields=_DEFAULT_FIELDS):
207 """Initialize the class.""" 208 self.handle = handle 209 self.has_comments = comments 210 self.fields = self._prep_fields(fields) 211 self.line = self.handle.readline().strip()
212
213 - def __iter__(self):
214 # stop iteration if file has no lines 215 if not self.line: 216 return 217 # determine which iterator to use 218 elif self.has_comments: 219 iterfunc = self._parse_commented_qresult 220 else: 221 if self.line.startswith("#"): 222 raise ValueError("Encountered unexpected character '#' at the" 223 " beginning of a line. Set comments=True if" 224 " the file is a commented file.") 225 iterfunc = self._parse_qresult 226 227 for qresult in iterfunc(): 228 yield qresult
229
230 - def _prep_fields(self, fields):
231 """Validate and format the given fields for use by the parser (PRIVATE).""" 232 # cast into list if fields is a space-separated string 233 if isinstance(fields, basestring): 234 fields = fields.strip().split(' ') 235 # blast allows 'std' as a proxy for the standard default lists 236 # we want to transform 'std' to its proper column names 237 if 'std' in fields: 238 idx = fields.index('std') 239 fields = fields[:idx] + _DEFAULT_FIELDS + fields[idx + 1:] 240 # if set(fields) has a null intersection with minimum required 241 # fields for hit and query, raise an exception 242 if not set(fields).intersection(_MIN_QUERY_FIELDS) or \ 243 not set(fields).intersection(_MIN_HIT_FIELDS): 244 raise ValueError("Required query and/or hit ID field not found.") 245 246 return fields
247
248 - def _parse_commented_qresult(self):
249 """Yield `QueryResult` objects from a commented file (PRIVATE).""" 250 while True: 251 comments = self._parse_comments() 252 if comments: 253 try: 254 self.fields = comments['fields'] 255 # iterator for the query results 256 qres_iter = self._parse_qresult() 257 except KeyError: 258 # no fields means the query has no results 259 assert 'fields' not in comments 260 # create an iterator returning one empty qresult 261 # if the query has no results 262 qres_iter = iter([QueryResult()]) 263 264 for qresult in qres_iter: 265 for key, value in comments.items(): 266 setattr(qresult, key, value) 267 yield qresult 268 269 else: 270 break
271
272 - def _parse_comments(self):
273 """Return a dictionary containing tab file comments (PRIVATE).""" 274 comments = {} 275 while True: 276 # parse program and version 277 # example: # BLASTX 2.2.26+ 278 if 'BLAST' in self.line and 'processed' not in self.line: 279 program_line = self.line[len(' #'):].split(' ') 280 comments['program'] = program_line[0].lower() 281 comments['version'] = program_line[1] 282 # parse query id and description (if available) 283 # example: # Query: gi|356995852 Mus musculus POU domain 284 elif 'Query' in self.line: 285 query_line = self.line[len('# Query: '):].split(' ', 1) 286 comments['id'] = query_line[0] 287 if len(query_line) == 2: 288 comments['description'] = query_line[1] 289 # parse target database 290 # example: # Database: db/minirefseq_protein 291 elif 'Database' in self.line: 292 comments['target'] = self.line[len('# Database: '):] 293 # parse RID (from remote searches) 294 elif 'RID' in self.line: 295 comments['rid'] = self.line[len('# RID: '):] 296 # parse column order, required for parsing the result lines 297 # example: # Fields: query id, query gi, query acc., query length 298 elif 'Fields' in self.line: 299 comments['fields'] = self._parse_fields_line() 300 # if the line has these strings, it's either the end of a comment 301 # or the end of a file, so we return all the comments we've parsed 302 elif ' hits found' in self.line or 'processed' in self.line: 303 self.line = self.handle.readline().strip() 304 return comments 305 306 self.line = self.handle.readline() 307 308 if not self.line: 309 return comments 310 else: 311 self.line = self.line.strip()
312
313 - def _parse_fields_line(self):
314 """Return column short names line from 'Fields' comment line (PRIVATE).""" 315 raw_field_str = self.line[len('# Fields: '):] 316 long_fields = raw_field_str.split(', ') 317 fields = [_LONG_SHORT_MAP[long_name] for long_name in long_fields] 318 return self._prep_fields(fields)
319
320 - def _parse_result_row(self):
321 """Return a dictionary of parsed row values (PRIVATE).""" 322 fields = self.fields 323 columns = self.line.strip().split('\t') 324 assert len(fields) == len(columns), "Expected %i columns, found: " \ 325 "%i" % (len(fields), len(columns)) 326 327 qresult, hit, hsp, frag = {}, {}, {}, {} 328 for idx, value in enumerate(columns): 329 sname = fields[idx] 330 # flag to check if any of the _COLUMNs contain sname 331 in_mapping = False 332 # iterate over each dict, mapping pair to determine 333 # attribute name and value of each column 334 for parsed_dict, mapping in ( 335 (qresult, _COLUMN_QRESULT), 336 (hit, _COLUMN_HIT), 337 (hsp, _COLUMN_HSP), 338 (frag, _COLUMN_FRAG)): 339 # process parsed value according to mapping 340 if sname in mapping: 341 attr_name, caster = mapping[sname] 342 if caster is not str: 343 value = caster(value) 344 parsed_dict[attr_name] = value 345 in_mapping = True 346 # make sure that any unhandled field is not supported 347 if not in_mapping: 348 assert sname not in _SUPPORTED_FIELDS 349 350 return {'qresult': qresult, 'hit': hit, 'hsp': hsp, 'frag': frag}
351
352 - def _get_id(self, parsed):
353 """Return the value used for a QueryResult or Hit ID from a parsed row (PRIVATE).""" 354 # use 'id', with 'id_all', 'accession' and 'accession_version' 355 # fallbacks one of these must have a value since we've checked whether 356 # they exist or not when parsing the comments 357 id_cache = parsed.get('id') 358 if id_cache is None and 'id_all' in parsed: 359 id_cache = parsed.get('id_all')[0] 360 if id_cache is None: 361 id_cache = parsed.get('accession') 362 if id_cache is None: 363 id_cache = parsed.get('accession_version') 364 365 return id_cache
366
367 - def _parse_qresult(self):
368 """Yield QueryResult objects (PRIVATE).""" 369 # state values, used to determine what to do with each line 370 state_EOF = 0 371 state_QRES_NEW = 1 372 state_QRES_SAME = 3 373 state_HIT_NEW = 2 374 state_HIT_SAME = 4 375 # dummies for initial states 376 qres_state = None 377 hit_state = None 378 file_state = None 379 cur_qid = None 380 cur_hid = None 381 # dummies for initial id caches 382 prev_qid = None 383 prev_hid = None 384 # dummies for initial parsed value containers 385 cur, prev = None, None 386 hit_list, hsp_list = [], [] 387 388 while True: 389 # store previous line's parsed values if we've past the first line 390 if cur is not None: 391 prev = cur 392 prev_qid = cur_qid 393 prev_hid = cur_hid 394 # only parse the line if it's not EOF or not a comment line 395 if self.line and not self.line.startswith('#'): 396 cur = self._parse_result_row() 397 cur_qid = self._get_id(cur['qresult']) 398 cur_hid = self._get_id(cur['hit']) 399 else: 400 file_state = state_EOF 401 # mock values for cur_qid and cur_hid since the line is empty 402 cur_qid, cur_hid = None, None 403 404 # get the state of hit and qresult 405 if prev_qid != cur_qid: 406 qres_state = state_QRES_NEW 407 else: 408 qres_state = state_QRES_SAME 409 # new hits are hits with different id or hits in a new qresult 410 if prev_hid != cur_hid or qres_state == state_QRES_NEW: 411 hit_state = state_HIT_NEW 412 else: 413 hit_state = state_HIT_SAME 414 415 # we're creating objects for the previously parsed line(s), 416 # so nothing is done in the first parsed line (prev == None) 417 if prev is not None: 418 # every line is essentially an HSP with one fragment, so we 419 # create both of these for every line 420 frag = HSPFragment(prev_hid, prev_qid) 421 for attr, value in prev['frag'].items(): 422 # adjust coordinates to Python range 423 # NOTE: this requires both start and end coords to be 424 # present, otherwise a KeyError will be raised. 425 # Without this limitation, we might misleadingly set the 426 # start / end coords 427 for seq_type in ('query', 'hit'): 428 if attr == seq_type + '_start': 429 value = min(value, 430 prev['frag'][seq_type + '_end']) - 1 431 elif attr == seq_type + '_end': 432 value = max(value, 433 prev['frag'][seq_type + '_start']) 434 setattr(frag, attr, value) 435 # strand and frame setattr require the full parsed values 436 # to be set first 437 for seq_type in ('hit', 'query'): 438 # try to set hit and query frame 439 frame = self._get_frag_frame(frag, seq_type, 440 prev['frag']) 441 setattr(frag, '%s_frame' % seq_type, frame) 442 # try to set hit and query strand 443 strand = self._get_frag_strand(frag, seq_type, 444 prev['frag']) 445 setattr(frag, '%s_strand' % seq_type, strand) 446 447 hsp = HSP([frag]) 448 for attr, value in prev['hsp'].items(): 449 setattr(hsp, attr, value) 450 hsp_list.append(hsp) 451 452 # create hit and append to temp hit container if hit_state 453 # says we're not at the same hit or at a new query 454 if hit_state == state_HIT_NEW: 455 hit = Hit(hsp_list) 456 for attr, value in prev['hit'].items(): 457 if attr != 'id_all': 458 setattr(hit, attr, value) 459 else: 460 # not setting hit ID since it's already set from the 461 # prev_hid above 462 setattr(hit, '_id_alt', value[1:]) 463 hit_list.append(hit) 464 hsp_list = [] 465 # create qresult and yield if we're at a new qresult or EOF 466 if qres_state == state_QRES_NEW or file_state == state_EOF: 467 qresult = QueryResult(hit_list, prev_qid) 468 for attr, value in prev['qresult'].items(): 469 setattr(qresult, attr, value) 470 yield qresult 471 # if current line is EOF, break 472 if file_state == state_EOF: 473 break 474 hit_list = [] 475 476 self.line = self.handle.readline().strip()
477
478 - def _get_frag_frame(self, frag, seq_type, parsedict):
479 """Return fragment frame for given object (PRIVATE). 480 481 Returns `HSPFragment` frame given the object, its sequence type, 482 and its parsed dictionary values. 483 """ 484 assert seq_type in ('query', 'hit') 485 frame = getattr(frag, '%s_frame' % seq_type, None) 486 if frame is not None: 487 return frame 488 else: 489 if 'frames' in parsedict: 490 # frames is 'x1/x2' string, x1 is query frame, x2 is hit frame 491 idx = 0 if seq_type == 'query' else 1 492 return int(parsedict['frames'].split('/')[idx])
493 # else implicit None return 494
495 - def _get_frag_strand(self, frag, seq_type, parsedict):
496 """Return fragment strand for given object (PRIVATE). 497 498 Returns `HSPFragment` strand given the object, its sequence type, 499 and its parsed dictionary values. 500 """ 501 # NOTE: this will never set the strands as 0 for protein 502 # queries / hits, since we can't detect the blast flavors 503 # from the columns alone. 504 assert seq_type in ('query', 'hit') 505 strand = getattr(frag, '%s_strand' % seq_type, None) 506 if strand is not None: 507 return strand 508 else: 509 # using parsedict instead of the fragment object since 510 # we need the unadjusted coordinated values 511 start = parsedict.get('%s_start' % seq_type) 512 end = parsedict.get('%s_end' % seq_type) 513 if start is not None and end is not None: 514 return 1 if start <= end else -1
515 # else implicit None return 516 517
518 -class BlastTabIndexer(SearchIndexer):
519 """Indexer class for BLAST+ tab output.""" 520 521 _parser = BlastTabParser 522
523 - def __init__(self, filename, comments=False, fields=_DEFAULT_FIELDS):
524 """Initialize the class.""" 525 SearchIndexer.__init__(self, filename, comments=comments, fields=fields) 526 527 # if the file doesn't have comments, 528 # get index of column used as the key (qseqid / qacc / qaccver) 529 if not self._kwargs['comments']: 530 if 'qseqid' in fields: 531 self._key_idx = fields.index('qseqid') 532 elif 'qacc' in fields: 533 self._key_idx = fields.index('qacc') 534 elif 'qaccver' in fields: 535 self._key_idx = fields.index('qaccver') 536 else: 537 raise ValueError("Custom fields is missing an ID column. " 538 "One of these must be present: 'qseqid', 'qacc', or 'qaccver'.")
539
540 - def __iter__(self):
541 """Iterate over the file handle; yields key, start offset, and length.""" 542 handle = self._handle 543 handle.seek(0) 544 545 if not self._kwargs['comments']: 546 iterfunc = self._qresult_index 547 else: 548 iterfunc = self._qresult_index_commented 549 550 for key, offset, length in iterfunc(): 551 yield _bytes_to_string(key), offset, length
552
553 - def _qresult_index_commented(self):
554 """Indexer for commented BLAST tabular files (PRIVATE).""" 555 handle = self._handle 556 handle.seek(0) 557 start_offset = 0 558 # mark of a new query 559 query_mark = None 560 # mark of the query's ID 561 qid_mark = _as_bytes('# Query: ') 562 # mark of the last line 563 end_mark = _as_bytes('# BLAST processed') 564 565 while True: 566 end_offset = handle.tell() 567 line = handle.readline() 568 569 if query_mark is None: 570 query_mark = line 571 start_offset = end_offset 572 elif line.startswith(qid_mark): 573 qresult_key = line[len(qid_mark):].split()[0] 574 elif line == query_mark or line.startswith(end_mark): 575 yield qresult_key, start_offset, end_offset - start_offset 576 start_offset = end_offset 577 elif not line: 578 break
579
580 - def _qresult_index(self):
581 """Indexer for noncommented BLAST tabular files (PRIVATE).""" 582 handle = self._handle 583 handle.seek(0) 584 start_offset = 0 585 qresult_key = None 586 key_idx = self._key_idx 587 tab_char = _as_bytes('\t') 588 589 while True: 590 # get end offset here since we only know a qresult ends after 591 # encountering the next one 592 end_offset = handle.tell() 593 # line = handle.readline() 594 line = handle.readline() 595 596 if qresult_key is None: 597 qresult_key = line.split(tab_char)[key_idx] 598 else: 599 try: 600 curr_key = line.split(tab_char)[key_idx] 601 except IndexError: 602 curr_key = _as_bytes('') 603 604 if curr_key != qresult_key: 605 yield qresult_key, start_offset, end_offset - start_offset 606 qresult_key = curr_key 607 start_offset = end_offset 608 609 # break if we've reached EOF 610 if not line: 611 break
612
613 - def get_raw(self, offset):
614 """Return the raw bytes string of a QueryResult object from the given offset.""" 615 if self._kwargs['comments']: 616 getfunc = self._get_raw_qresult_commented 617 else: 618 getfunc = self._get_raw_qresult 619 620 return getfunc(offset)
621
622 - def _get_raw_qresult(self, offset):
623 """Return the raw bytes string of a single QueryResult from a noncommented file (PRIVATE).""" 624 handle = self._handle 625 handle.seek(offset) 626 qresult_raw = _as_bytes('') 627 tab_char = _as_bytes('\t') 628 key_idx = self._key_idx 629 qresult_key = None 630 631 while True: 632 line = handle.readline() 633 # get the key if the first line (qresult key) 634 if qresult_key is None: 635 qresult_key = line.split(tab_char)[key_idx] 636 else: 637 try: 638 curr_key = line.split(tab_char)[key_idx] 639 except IndexError: 640 curr_key = _as_bytes('') 641 # only break when qresult is finished (key is different) 642 if curr_key != qresult_key: 643 break 644 # append to the raw string as long as qresult is the same 645 qresult_raw += line 646 647 return qresult_raw
648
649 - def _get_raw_qresult_commented(self, offset):
650 """Return the bytes raw string of a single QueryResult from a commented file (PRIVATE).""" 651 handle = self._handle 652 handle.seek(offset) 653 qresult_raw = _as_bytes('') 654 end_mark = _as_bytes('# BLAST processed') 655 656 # query mark is the line marking a new query 657 # something like '# TBLASTN 2.2.25+' 658 query_mark = None 659 line = handle.readline() 660 while line: 661 # since query_mark depends on the BLAST search, we need to obtain it 662 # first 663 if query_mark is None: 664 query_mark = line 665 # break when we've reached the next qresult or the search ends 666 elif line == query_mark or line.startswith(end_mark): 667 break 668 669 qresult_raw += line 670 line = handle.readline() 671 672 return qresult_raw
673 674
675 -class BlastTabWriter(object):
676 """Writer for blast-tab output format.""" 677
678 - def __init__(self, handle, comments=False, fields=_DEFAULT_FIELDS):
679 """Initialize the class.""" 680 self.handle = handle 681 self.has_comments = comments 682 self.fields = fields
683
684 - def write_file(self, qresults):
685 """Write to the handle, return how many QueryResult objects were written.""" 686 handle = self.handle 687 qresult_counter, hit_counter, hsp_counter, frag_counter = 0, 0, 0, 0 688 689 for qresult in qresults: 690 if self.has_comments: 691 handle.write(self._build_comments(qresult)) 692 if qresult: 693 handle.write(self._build_rows(qresult)) 694 if not self.has_comments: 695 qresult_counter += 1 696 hit_counter += len(qresult) 697 hsp_counter += sum(len(hit) for hit in qresult) 698 frag_counter += sum(len(hit.fragments) for hit in qresult) 699 # if it's commented and there are no hits in the qresult, we still 700 # increment the counter 701 if self.has_comments: 702 qresult_counter += 1 703 704 # commented files have a line saying how many queries were processed 705 if self.has_comments: 706 handle.write('# BLAST processed %i queries' % qresult_counter) 707 708 return qresult_counter, hit_counter, hsp_counter, frag_counter
709
710 - def _build_rows(self, qresult):
711 """Return a string containing tabular rows of the QueryResult object (PRIVATE).""" 712 coordinates = set(['qstart', 'qend', 'sstart', 'send']) 713 qresult_lines = '' 714 for hit in qresult: 715 for hsp in hit: 716 line = [] 717 for field in self.fields: 718 # get the column value ~ could either be an attribute 719 # of qresult, hit, or hsp 720 if field in _COLUMN_QRESULT: 721 value = getattr(qresult, _COLUMN_QRESULT[field][0]) 722 elif field in _COLUMN_HIT: 723 if field == 'sallseqid': 724 value = getattr(hit, 'id_all') 725 else: 726 value = getattr(hit, _COLUMN_HIT[field][0]) 727 # special case, since 'frames' can be determined from 728 # query frame and hit frame 729 elif field == 'frames': 730 value = '%i/%i' % (hsp.query_frame, hsp.hit_frame) 731 elif field in _COLUMN_HSP: 732 try: 733 value = getattr(hsp, _COLUMN_HSP[field][0]) 734 except AttributeError: 735 attr = _COLUMN_HSP[field][0] 736 _augment_blast_hsp(hsp, attr) 737 value = getattr(hsp, attr) 738 elif field in _COLUMN_FRAG: 739 value = getattr(hsp, _COLUMN_FRAG[field][0]) 740 else: 741 assert field not in _SUPPORTED_FIELDS 742 continue 743 744 # adjust from and to according to strand, if from and to 745 # is included in the output field 746 if field in coordinates: 747 value = self._adjust_coords(field, value, hsp) 748 # adjust output formatting 749 value = self._adjust_output(field, value) 750 751 line.append(value) 752 753 hsp_line = '\t'.join(line) 754 qresult_lines += hsp_line + '\n' 755 756 return qresult_lines
757
758 - def _adjust_coords(self, field, value, hsp):
759 """Adjust start and end coordinates according to strand (PRIVATE).""" 760 assert field in ('qstart', 'qend', 'sstart', 'send') 761 # determine sequence type to operate on based on field's first letter 762 seq_type = 'query' if field.startswith('q') else 'hit' 763 764 strand = getattr(hsp, '%s_strand' % seq_type, None) 765 if strand is None: 766 raise ValueError("Required attribute %r not found." % 767 ('%s_strand' % (seq_type))) 768 # switch start <--> end coordinates if strand is -1 769 if strand < 0: 770 if field.endswith('start'): 771 value = getattr(hsp, '%s_end' % seq_type) 772 elif field.endswith('end'): 773 value = getattr(hsp, '%s_start' % seq_type) + 1 774 elif field.endswith('start'): 775 # adjust start coordinate for positive strand 776 value += 1 777 778 return value
779
780 - def _adjust_output(self, field, value):
781 """Adjust formatting of given field and value to mimic native tab output (PRIVATE).""" 782 # qseq and sseq are stored as SeqRecord, but here we only need the str 783 if field in ('qseq', 'sseq'): 784 value = str(value.seq) 785 786 # evalue formatting, adapted from BLAST+ source: 787 # src/objtools/align_format/align_format_util.cpp#L668 788 elif field == 'evalue': 789 if value < 1.0e-180: 790 value = '0.0' 791 elif value < 1.0e-99: 792 value = '%2.0e' % value 793 elif value < 0.0009: 794 value = '%3.0e' % value 795 elif value < 0.1: 796 value = '%4.3f' % value 797 elif value < 1.0: 798 value = '%3.2f' % value 799 elif value < 10.0: 800 value = '%2.1f' % value 801 else: 802 value = '%5.0f' % value 803 804 # pident and ppos formatting 805 elif field in ('pident', 'ppos'): 806 value = '%.2f' % value 807 808 # evalue formatting, adapted from BLAST+ source: 809 # src/objtools/align_format/align_format_util.cpp#L723 810 elif field == 'bitscore': 811 if value > 9999: 812 value = '%4.3e' % value 813 elif value > 99.9: 814 value = '%4.0d' % value 815 else: 816 value = '%4.1f' % value 817 818 # coverages have no comma (using floats still ~ a more proper 819 # representation) 820 elif field in ('qcovhsp', 'qcovs'): 821 value = '%.0f' % value 822 823 # list into '<>'-delimited string 824 elif field == 'salltitles': 825 value = '<>'.join(value) 826 827 # list into ';'-delimited string 828 elif field in ('sallseqid', 'sallacc', 'staxids', 'sscinames', 829 'scomnames', 'sblastnames', 'sskingdoms'): 830 value = ';'.join(value) 831 832 # everything else 833 else: 834 value = str(value) 835 836 return value
837
838 - def _build_comments(self, qres):
839 """Return QueryResult tabular comment as a string (PRIVATE).""" 840 comments = [] 841 # inverse mapping of the long-short name map, required 842 # for writing comments 843 inv_field_map = dict((v, k) for k, v in _LONG_SHORT_MAP.items()) 844 845 # try to anticipate qress without version 846 if not hasattr(qres, 'version'): 847 program_line = '# %s' % qres.program.upper() 848 else: 849 program_line = '# %s %s' % (qres.program.upper(), qres.version) 850 comments.append(program_line) 851 # description may or may not be None 852 if qres.description is None: 853 comments.append('# Query: %s' % qres.id) 854 else: 855 comments.append('# Query: %s %s' % (qres.id, qres.description)) 856 # try appending RID line, if present 857 try: 858 comments.append('# RID: %s' % qres.rid) 859 except AttributeError: 860 pass 861 comments.append('# Database: %s' % qres.target) 862 # qresults without hits don't show the Fields comment 863 if qres: 864 comments.append('# Fields: %s' % 865 ', '.join(inv_field_map[field] for field in self.fields)) 866 comments.append('# %i hits found' % len(qres)) 867 868 return '\n'.join(comments) + '\n'
869 870 871 # if not used as a module, run the doctest 872 if __name__ == "__main__": 873 from Bio._utils import run_doctest 874 run_doctest() 875