Package Bio :: Package SearchIO :: Package BlastIO :: Module blast_xml
[hide private]
[frames] | no frames]

Source Code for Module Bio.SearchIO.BlastIO.blast_xml

  1  # Copyright 2012 by Wibowo Arindrarto.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Bio.SearchIO parser for BLAST+ XML output formats.""" 
  7  # for more info: http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.mod.dtd 
  8   
  9  import sys 
 10  import re 
 11  import warnings 
 12  from itertools import chain 
 13  from xml.sax.saxutils import XMLGenerator, escape 
 14   
 15  from Bio import BiopythonParserWarning 
 16   
 17  # For speed try to use cElementTree rather than ElementTree 
 18  try: 
 19      if (3, 0) <= sys.version_info[:2] <= (3, 1): 
 20          # Workaround for bug in python 3.0 and 3.1, 
 21          # see http://bugs.python.org/issue9257 
 22          from xml.etree import ElementTree as ElementTree 
 23      else: 
 24          from xml.etree import cElementTree as ElementTree 
 25  except ImportError: 
 26      from xml.etree import ElementTree as ElementTree 
 27   
 28   
 29  from Bio.Alphabet import generic_dna, generic_protein 
 30  from Bio.SearchIO._index import SearchIndexer 
 31  from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment 
 32   
 33  from Bio._py3k import _as_bytes, _bytes_to_string, unicode 
 34  _empty_bytes_string = _as_bytes("") 
 35   
 36  __all__ = ['BlastXmlParser', 'BlastXmlIndexer', 'BlastXmlWriter'] 
 37   
 38   
 39  # element - optional qresult attribute name mapping 
 40  _ELEM_QRESULT_OPT = { 
 41      'Statistics_db-num': ('stat_db_num', int), 
 42      'Statistics_db-len': ('stat_db_len', int), 
 43      'Statistics_eff-space': ('stat_eff_space', float), 
 44      'Statistics_hsp-len': ('stat_hsp_len', int), 
 45      'Statistics_kappa': ('stat_kappa', float), 
 46      'Statistics_lambda': ('stat_lambda', float), 
 47      'Statistics_entropy': ('stat_entropy', float), 
 48  } 
 49  # element - hit attribute name mapping 
 50  _ELEM_HIT = { 
 51      # 'Hit_def': ('description', str),   # not set by this dict 
 52      'Hit_accession': ('accession', str), 
 53      'Hit_len': ('seq_len', int), 
 54  } 
 55  # element - hsp attribute name mapping 
 56  _ELEM_HSP = { 
 57      'Hsp_bit-score': ('bitscore', float), 
 58      'Hsp_score': ('bitscore_raw', int), 
 59      'Hsp_evalue': ('evalue', float), 
 60      'Hsp_identity': ('ident_num', int), 
 61      'Hsp_positive': ('pos_num', int), 
 62      'Hsp_gaps': ('gap_num', int), 
 63      'Hsp_density': ('density', float), 
 64  } 
 65  # element - fragment attribute name mapping 
 66  _ELEM_FRAG = { 
 67      'Hsp_query-from': ('query_start', int), 
 68      'Hsp_query-to': ('query_end', int), 
 69      'Hsp_hit-from': ('hit_start', int), 
 70      'Hsp_hit-to': ('hit_end', int), 
 71      'Hsp_query-frame': ('query_frame', int), 
 72      'Hsp_hit-frame': ('hit_frame', int), 
 73      'Hsp_align-len': ('aln_span', int), 
 74      'Hsp_pattern-from': ('pattern_start', int), 
 75      'Hsp_pattern-to': ('pattern_end', int), 
 76      'Hsp_hseq': ('hit', str), 
 77      'Hsp_qseq': ('query', str), 
 78  } 
 79  # dictionary for mapping tag name and meta key name 
 80  _ELEM_META = { 
 81      'BlastOutput_db': ('target', str), 
 82      'BlastOutput_program': ('program', str), 
 83      'BlastOutput_version': ('version', str), 
 84      'BlastOutput_reference': ('reference', str), 
 85      'Parameters_expect': ('param_evalue_threshold', float), 
 86      'Parameters_entrez-query': ('param_entrez_query', str), 
 87      'Parameters_filter': ('param_filter', str), 
 88      'Parameters_gap-extend': ('param_gap_extend', int), 
 89      'Parameters_gap-open': ('param_gap_open', int), 
 90      'Parameters_include': ('param_include', str), 
 91      'Parameters_matrix': ('param_matrix', str), 
 92      'Parameters_pattern': ('param_pattern', str), 
 93      'Parameters_sc-match': ('param_score_match', int), 
 94      'Parameters_sc-mismatch': ('param_score_mismatch', int), 
 95  } 
 96  # these are fallback tags that store information on the first query 
 97  # outside the <Iteration> tag 
 98  # only used if query_{ID,def,len} is not found in <Iteration> 
 99  # (seen in legacy Blast <2.2.14) 
100  _ELEM_QRESULT_FALLBACK = { 
101      'BlastOutput_query-ID': ('id', str), 
102      'BlastOutput_query-def': ('description', str), 
103      'BlastOutput_query-len': ('len', str), 
104  } 
105  # element-attribute maps, for writing 
106  _WRITE_MAPS = { 
107      'preamble': ( 
108          ('program', 'program'), 
109          ('version', 'version'), 
110          ('reference', 'reference'), 
111          ('db', 'target'), 
112          ('query-ID', 'id'), 
113          ('query-def', 'description'), 
114          ('query-len', 'seq_len'), 
115          ('param', None), 
116      ), 
117      'param': ( 
118          ('matrix', 'param_matrix'), 
119          ('expect', 'param_evalue_threshold'), 
120          ('sc-match', 'param_score_match'), 
121          ('sc-mismatch', 'param_score_mismatch'), 
122          ('gap-open', 'param_gap_open'), 
123          ('gap-extend', 'param_gap_extend'), 
124          ('filter', 'param_filter'), 
125          ('pattern', 'param_pattern'), 
126          ('entrez-query', 'param_entrez_query'), 
127      ), 
128      'qresult': ( 
129          ('query-ID', 'id'), 
130          ('query-def', 'description'), 
131          ('query-len', 'seq_len'), 
132      ), 
133      'stat': ( 
134          ('db-num', 'stat_db_num'), 
135          ('db-len', 'stat_db_len'), 
136          ('hsp-len', 'stat_hsp_len'), 
137          ('eff-space', 'stat_eff_space'), 
138          ('kappa', 'stat_kappa'), 
139          ('lambda', 'stat_lambda'), 
140          ('entropy', 'stat_entropy'), 
141      ), 
142      'hit': ( 
143          ('id', 'id'), 
144          ('def', 'description'), 
145          ('accession', 'accession'), 
146          ('len', 'seq_len'), 
147      ), 
148      'hsp': ( 
149          ('bit-score', 'bitscore'), 
150          ('score', 'bitscore_raw'), 
151          ('evalue', 'evalue'), 
152          ('query-from', 'query_start'), 
153          ('query-to', 'query_end'), 
154          ('hit-from', 'hit_start'), 
155          ('hit-to', 'hit_end'), 
156          ('pattern-from', 'pattern_start'), 
157          ('pattern-to', 'pattern_end'), 
158          ('query-frame', 'query_frame'), 
159          ('hit-frame', 'hit_frame'), 
160          ('identity', 'ident_num'), 
161          ('positive', 'pos_num'), 
162          ('gaps', 'gap_num'), 
163          ('align-len', 'aln_span'), 
164          ('density', 'density'), 
165          ('qseq', 'query'), 
166          ('hseq', 'hit'), 
167          ('midline', None), 
168      ), 
169  } 
170  # optional elements, based on the DTD 
171  _DTD_OPT = ( 
172      'BlastOutput_query-seq', 'BlastOutput_mbstat', 'Iteration_query-def', 
173      'Iteration_query-len', 'Iteration-hits', 'Iteration_stat', 
174      'Iteration_message', 'Parameters_matrix', 'Parameters_include', 
175      'Parameters_sc-match', 'Parameters_sc-mismatch', 'Parameters_filter', 
176      'Parameters_pattern', 'Parameters_entrez-query', 'Hit_hsps', 
177      'Hsp_pattern-from', 'Hsp_pattern-to', 'Hsp_query-frame', 'Hsp_hit-frame', 
178      'Hsp_identity', 'Hsp_positive', 'Hsp_gaps', 'Hsp_align-len', 'Hsp_density', 
179      'Hsp_midline', 
180  ) 
181   
182  # compile RE patterns 
183  # for capturing BLAST version 
184  _RE_VERSION = re.compile(r'\d+\.\d+\.\d+\+?') 
185  # for splitting ID-description pairs 
186  _RE_ID_DESC_PAIRS_PATTERN = re.compile(" +>") 
187  # for splitting ID and description (must be used with maxsplit = 1) 
188  _RE_ID_DESC_PATTERN = re.compile(" +") 
189   
190   
191 -def _extract_ids_and_descs(concat_str):
192 # Given a string space-separate string of IDs and descriptions, 193 # return a list of tuples, each tuple containing an ID and 194 # a description string (which may be empty) 195 196 # create a list of lists, each list containing an ID and description 197 # or just an ID, if description is not present 198 id_desc_pairs = [re.split(_RE_ID_DESC_PATTERN, x, 1) 199 for x in re.split(_RE_ID_DESC_PAIRS_PATTERN, concat_str)] 200 # make sure empty descriptions are added as empty strings 201 # also, we return lists for compatibility reasons between Py2 and Py3 202 add_descs = lambda x: x if len(x) == 2 else x + [""] 203 return [pair for pair in map(add_descs, id_desc_pairs)]
204 205
206 -class BlastXmlParser(object):
207 """Parser for the BLAST XML format""" 208
209 - def __init__(self, handle):
210 self.xml_iter = iter(ElementTree.iterparse(handle, events=('start', 'end'))) 211 self._meta, self._fallback = self._parse_preamble()
212
213 - def __iter__(self):
214 for qresult in self._parse_qresult(): 215 yield qresult
216
217 - def _parse_preamble(self):
218 """Parses all tag data prior to the first query result.""" 219 # dictionary for containing all information prior to the first query 220 meta = {} 221 # dictionary for fallback information 222 fallback = {} 223 224 # parse the preamble part (anything prior to the first result) 225 for event, elem in self.xml_iter: 226 # get the tag values, cast appropriately, store into meta 227 if event == 'end' and elem.tag in _ELEM_META: 228 attr_name, caster = _ELEM_META[elem.tag] 229 230 if caster is not str: 231 meta[attr_name] = caster(elem.text) 232 else: 233 meta[attr_name] = elem.text 234 235 # delete element after we finish parsing it 236 elem.clear() 237 continue 238 # capture fallback values 239 # these are used only if the first <Iteration> does not have any 240 # ID, ref, or len. 241 elif event == 'end' and elem.tag in _ELEM_QRESULT_FALLBACK: 242 attr_name, caster = _ELEM_QRESULT_FALLBACK[elem.tag] 243 244 if caster is not str: 245 fallback[attr_name] = caster(elem.text) 246 else: 247 fallback[attr_name] = elem.text 248 249 elem.clear() 250 continue 251 252 if event == 'start' and elem.tag == 'Iteration': 253 break 254 255 # we only want the version number, sans the program name or date 256 if meta.get('version') is not None: 257 meta['version'] = re.search(_RE_VERSION, 258 meta['version']).group(0) 259 260 return meta, fallback
261
262 - def _parse_qresult(self):
263 """Parses query results.""" 264 # parse the queries 265 for event, qresult_elem in self.xml_iter: 266 # </Iteration> marks the end of a single query 267 # which means we can process it 268 if event == 'end' and qresult_elem.tag == 'Iteration': 269 270 # we'll use the following schema 271 # <!ELEMENT Iteration ( 272 # Iteration_iter-num, 273 # Iteration_query-ID?, 274 # Iteration_query-def?, 275 # Iteration_query-len?, 276 # Iteration_hits?, 277 # Iteration_stat?, 278 # Iteration_message?)> 279 280 # assign query attributes with fallbacks 281 query_id = qresult_elem.findtext('Iteration_query-ID') 282 if query_id is None: 283 query_id = self._fallback['id'] 284 285 query_desc = qresult_elem.findtext('Iteration_query-def') 286 if query_desc is None: 287 query_desc = self._fallback['description'] 288 289 query_len = qresult_elem.findtext('Iteration_query-len') 290 if query_len is None: 291 query_len = self._fallback['len'] 292 293 # handle blast searches against databases with Blast's IDs 294 # 'Query_' marks the beginning of a BLAST+-generated ID, 295 # 'lcl|' marks the beginning of a BLAST legacy-generated ID 296 if query_id.startswith('Query_') or query_id.startswith('lcl|'): 297 # store the Blast-generated query ID 298 blast_query_id = query_id 299 id_desc = query_desc.split(' ', 1) 300 query_id = id_desc[0] 301 try: 302 query_desc = id_desc[1] 303 except IndexError: 304 query_desc = '' 305 else: 306 blast_query_id = '' 307 308 hit_list, key_list = [], [] 309 for hit in self._parse_hit(qresult_elem.find('Iteration_hits'), 310 query_id): 311 if hit: 312 # need to keep track of hit IDs, since there could be duplicates, 313 if hit.id in key_list: 314 warnings.warn("Adding hit with BLAST-generated ID " 315 "%r since hit ID %r is already present " 316 "in query %r. Your BLAST database may contain " 317 "duplicate entries." % 318 (hit._blast_id, hit.id, query_id), BiopythonParserWarning) 319 # fallback to Blast-generated IDs, if the ID is already present 320 # and restore the desc, too 321 hit.description = '%s %s' % (hit.id, hit.description) 322 hit.id = hit._blast_id 323 # and change the hit_id of the HSPs contained 324 for hsp in hit: 325 hsp.hit_id = hit._blast_id 326 else: 327 key_list.append(hit.id) 328 329 hit_list.append(hit) 330 331 # create qresult and assign its attributes 332 qresult = QueryResult(hit_list, query_id) 333 qresult.description = query_desc 334 qresult.seq_len = int(query_len) 335 qresult._blast_id = blast_query_id 336 for key, value in self._meta.items(): 337 setattr(qresult, key, value) 338 339 # statistics are stored in Iteration_stat's 'grandchildren' with the 340 # following DTD 341 # <!ELEMENT Statistics ( 342 # Statistics_db-num, 343 # Statistics_db-len, 344 # Statistics_hsp-len, 345 # Statistics_eff-space, 346 # Statistics_kappa, 347 # Statistics_lambda, 348 # Statistics_entropy)> 349 350 stat_iter_elem = qresult_elem.find('Iteration_stat') 351 if stat_iter_elem is not None: 352 stat_elem = stat_iter_elem.find('Statistics') 353 354 for key, val_info in _ELEM_QRESULT_OPT.items(): 355 value = stat_elem.findtext(key) 356 if value is not None: 357 caster = val_info[1] 358 # recast only if value is not intended to be str 359 if value is not None and caster is not str: 360 value = caster(value) 361 setattr(qresult, val_info[0], value) 362 363 # delete element after we finish parsing it 364 qresult_elem.clear() 365 yield qresult
366
367 - def _parse_hit(self, root_hit_elem, query_id):
368 """Generator that transforms Iteration_hits XML elements into Hit objects. 369 370 :param root_hit_elem: root element of the Iteration_hits tag. 371 :type root_hit_elem: XML element tag 372 :param query_id: QueryResult ID of this Hit 373 :type query_id: string 374 375 """ 376 # Hit level processing 377 # Hits are stored in the Iteration_hits tag, with the following 378 # DTD 379 # <!ELEMENT Hit ( 380 # Hit_num, 381 # Hit_id, 382 # Hit_def, 383 # Hit_accession, 384 # Hit_len, 385 # Hit_hsps?)> 386 387 # feed the loop below an empty list so iteration still works 388 if root_hit_elem is None: 389 root_hit_elem = [] 390 391 for hit_elem in root_hit_elem: 392 393 # create empty hit object 394 hit_id = hit_elem.findtext('Hit_id') 395 hit_desc = hit_elem.findtext('Hit_def') 396 # handle blast searches against databases with Blast's IDs 397 if hit_id.startswith('gnl|BL_ORD_ID|'): 398 blast_hit_id = hit_id 399 id_desc = hit_desc.split(' ', 1) 400 hit_id = id_desc[0] 401 try: 402 hit_desc = id_desc[1] 403 except IndexError: 404 hit_desc = '' 405 else: 406 blast_hit_id = '' 407 408 # combine primary ID and defline first before splitting 409 full_id_desc = hit_id + ' ' + hit_desc 410 id_descs = _extract_ids_and_descs(full_id_desc) 411 hit_id, hit_desc = id_descs[0] 412 413 hsps = [hsp for hsp in 414 self._parse_hsp(hit_elem.find('Hit_hsps'), 415 query_id, hit_id)] 416 417 hit = Hit(hsps) 418 hit.description = hit_desc 419 hit._id_alt = [x[0] for x in id_descs[1:]] 420 hit._description_alt = [x[1] for x in id_descs[1:]] 421 # blast_hit_id is only set if the hit ID is Blast-generated 422 hit._blast_id = blast_hit_id 423 424 for key, val_info in _ELEM_HIT.items(): 425 value = hit_elem.findtext(key) 426 if value is not None: 427 caster = val_info[1] 428 # recast only if value is not intended to be str 429 if value is not None and caster is not str: 430 value = caster(value) 431 setattr(hit, val_info[0], value) 432 433 # delete element after we finish parsing it 434 hit_elem.clear() 435 yield hit
436
437 - def _parse_hsp(self, root_hsp_frag_elem, query_id, hit_id):
438 """Iterator that transforms Hit_hsps XML elements into HSP objects. 439 440 :param root_hsp_frag_elem: the ``Hit_hsps`` tag 441 :type root_hsp_frag_elem: XML element tag 442 :param query_id: query ID 443 :type query_id: string 444 :param hit_id: hit ID 445 :type hit_id: string 446 447 """ 448 # Hit_hsps DTD: 449 # <!ELEMENT Hsp ( 450 # Hsp_num, 451 # Hsp_bit-score, 452 # Hsp_score, 453 # Hsp_evalue, 454 # Hsp_query-from, 455 # Hsp_query-to, 456 # Hsp_hit-from, 457 # Hsp_hit-to, 458 # Hsp_pattern-from?, 459 # Hsp_pattern-to?, 460 # Hsp_query-frame?, 461 # Hsp_hit-frame?, 462 # Hsp_identity?, 463 # Hsp_positive?, 464 # Hsp_gaps?, 465 # Hsp_align-len?, 466 # Hsp_density?, 467 # Hsp_qseq, 468 # Hsp_hseq, 469 # Hsp_midline?)> 470 471 # if value is None, feed the loop below an empty list 472 if root_hsp_frag_elem is None: 473 root_hsp_frag_elem = [] 474 475 for hsp_frag_elem in root_hsp_frag_elem: 476 coords = {} # temporary container for coordinates 477 frag = HSPFragment(hit_id, query_id) 478 for key, val_info in _ELEM_FRAG.items(): 479 value = hsp_frag_elem.findtext(key) 480 caster = val_info[1] 481 482 # adjust 'from' and 'to' coordinates to 0-based ones 483 if value is not None: 484 if key.endswith('-from') or key.endswith('-to'): 485 # store coordinates for further processing 486 coords[val_info[0]] = caster(value) 487 continue 488 # recast only if value is not intended to be str 489 elif caster is not str: 490 value = caster(value) 491 setattr(frag, val_info[0], value) 492 493 # set the similarity characters into aln_annotation dict 494 frag.aln_annotation['similarity'] = \ 495 hsp_frag_elem.findtext('Hsp_midline') 496 497 # process coordinates 498 # since 'x-from' could be bigger than 'x-to', we need to figure 499 # out which one is smaller/bigger since 'x_start' is always smaller 500 # than 'x_end' 501 for coord_type in ('query', 'hit', 'pattern'): 502 start_type = coord_type + '_start' 503 end_type = coord_type + '_end' 504 try: 505 start = coords[start_type] 506 end = coords[end_type] 507 except KeyError: 508 continue 509 else: 510 # convert to python range and setattr 511 setattr(frag, start_type, min(start, end) - 1) 512 setattr(frag, end_type, max(start, end)) 513 514 # set alphabet, based on program 515 prog = self._meta.get('program') 516 if prog == 'blastn': 517 frag.alphabet = generic_dna 518 elif prog in ['blastp', 'blastx', 'tblastn', 'tblastx']: 519 frag.alphabet = generic_protein 520 521 hsp = HSP([frag]) 522 for key, val_info in _ELEM_HSP.items(): 523 value = hsp_frag_elem.findtext(key) 524 caster = val_info[1] 525 if value is not None: 526 if caster is not str: 527 value = caster(value) 528 setattr(hsp, val_info[0], value) 529 # delete element after we finish parsing it 530 hsp_frag_elem.clear() 531 yield hsp
532 533
534 -class BlastXmlIndexer(SearchIndexer):
535 """Indexer class for BLAST XML output.""" 536 537 _parser = BlastXmlParser 538 qstart_mark = _as_bytes('<Iteration>') 539 qend_mark = _as_bytes('</Iteration>') 540 block_size = 16384 541
542 - def __init__(self, filename):
543 SearchIndexer.__init__(self, filename) 544 # TODO: better way to do this? 545 iter_obj = self._parser(self._handle) 546 self._meta, self._fallback = iter_obj._meta, iter_obj._fallback
547
548 - def __iter__(self):
549 qstart_mark = self.qstart_mark 550 qend_mark = self.qend_mark 551 blast_id_mark = _as_bytes('Query_') 552 block_size = self.block_size 553 handle = self._handle 554 handle.seek(0) 555 re_desc = re.compile(_as_bytes(r'<Iteration_query-ID>(.*?)' 556 '</Iteration_query-ID>\s+?<Iteration_query-def>' 557 '(.*?)</Iteration_query-def>')) 558 re_desc_end = re.compile(_as_bytes(r'</Iteration_query-def>')) 559 counter = 0 560 561 while True: 562 start_offset = handle.tell() 563 line = handle.readline() 564 if not line: 565 break 566 if qstart_mark not in line: 567 continue 568 # The following requirements are to make supporting BGZF compressed 569 # BLAST XML files simpler (avoids complex offset manipulations): 570 assert line.count(qstart_mark) == 1, "XML without line breaks?" 571 assert line.lstrip().startswith(qstart_mark), line 572 if qend_mark in line: 573 # Should cope with <Iteration>...</Iteration> on one long line 574 block = line 575 else: 576 # Load the rest of this block up to and including </Iteration> 577 block = [line] 578 while line and qend_mark not in line: 579 line = handle.readline() 580 assert qstart_mark not in line, line 581 block.append(line) 582 assert line.rstrip().endswith(qend_mark), line 583 block = _empty_bytes_string.join(block) 584 assert block.count(qstart_mark) == 1, "XML without line breaks? %r" % block 585 assert block.count(qend_mark) == 1, "XML without line breaks? %r" % block 586 # Now we have a full <Iteration>...</Iteration> block, find the ID 587 regx = re.search(re_desc, block) 588 try: 589 qstart_desc = regx.group(2) 590 qstart_id = regx.group(1) 591 except AttributeError: 592 # use the fallback values 593 assert re.search(re_desc_end, block) 594 qstart_desc = _as_bytes(self._fallback['description']) 595 qstart_id = _as_bytes(self._fallback['id']) 596 if qstart_id.startswith(blast_id_mark): 597 qstart_id = qstart_desc.split(_as_bytes(' '), 1)[0] 598 yield _bytes_to_string(qstart_id), start_offset, len(block) 599 counter += 1
600
601 - def _parse(self, handle):
602 # overwrites SearchIndexer._parse, since we need to set the meta and 603 # fallback dictionaries to the parser 604 generator = self._parser(handle, **self._kwargs) 605 generator._meta = self._meta 606 generator._fallback = self._fallback 607 return next(iter(generator))
608
609 - def get_raw(self, offset):
610 """Return the raw record from the file as a bytes string.""" 611 qend_mark = self.qend_mark 612 handle = self._handle 613 handle.seek(offset) 614 615 qresult_raw = handle.readline() 616 assert qresult_raw.lstrip().startswith(self.qstart_mark) 617 while qend_mark not in qresult_raw: 618 qresult_raw += handle.readline() 619 assert qresult_raw.rstrip().endswith(qend_mark) 620 assert qresult_raw.count(qend_mark) == 1 621 # Note this will include any leading and trailing whitespace, in 622 # general expecting " <Iteration>\n...\n </Iteration>\n" 623 return qresult_raw
624 625
626 -class _BlastXmlGenerator(XMLGenerator):
627 """Event-based XML Generator.""" 628
629 - def __init__(self, out, encoding='utf-8', indent=" ", increment=2):
630 XMLGenerator.__init__(self, out, encoding) 631 # the indentation character 632 self._indent = indent 633 # nest level 634 self._level = 0 635 # how many indentation character should we increment per level 636 self._increment = increment 637 # container for names of tags with children 638 self._parent_stack = [] 639 # determine writer method 640 try: 641 # this should work for all platforms except Jython 642 self.write = self._write 643 except AttributeError: 644 # Jython uses self._out.write 645 self.write = self._out.write
646
647 - def startDocument(self):
648 """Starts the XML document.""" 649 self.write(u'<?xml version="1.0"?>\n' 650 '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" ' 651 '"http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">\n')
652
653 - def startElement(self, name, attrs=None, children=False):
654 """Starts an XML element. 655 656 :param name: element name 657 :type name: string 658 :param attrs: element attributes 659 :type attrs: dictionary {string: object} 660 :param children: whether the element has children or not 661 :type children: bool 662 663 """ 664 if attrs is None: 665 attrs = {} 666 self.ignorableWhitespace(self._indent * self._level) 667 XMLGenerator.startElement(self, name, attrs)
668
669 - def endElement(self, name):
670 """Ends and XML element of the given name.""" 671 XMLGenerator.endElement(self, name) 672 self.write(u'\n')
673
674 - def startParent(self, name, attrs=None):
675 """Starts an XML element which has children. 676 677 :param name: element name 678 :type name: string 679 :param attrs: element attributes 680 :type attrs: dictionary {string: object} 681 682 """ 683 if attrs is None: 684 attrs = {} 685 self.startElement(name, attrs, children=True) 686 self._level += self._increment 687 self.write(u'\n') 688 # append the element name, so we can end it later 689 self._parent_stack.append(name)
690
691 - def endParent(self):
692 """Ends an XML element with children.""" 693 # the element to end is the one on top of the stack 694 name = self._parent_stack.pop() 695 self._level -= self._increment 696 self.ignorableWhitespace(self._indent * self._level) 697 self.endElement(name)
698
699 - def startParents(self, *names):
700 """Starts XML elements without children.""" 701 for name in names: 702 self.startParent(name)
703
704 - def endParents(self, num):
705 """Ends XML elements, according to the given number.""" 706 for i in range(num): 707 self.endParent()
708
709 - def simpleElement(self, name, content=None):
710 """Creates an XML element without children with the given content.""" 711 self.startElement(name, attrs={}) 712 if content: 713 self.characters(content) 714 self.endElement(name)
715
716 - def characters(self, content):
717 content = escape(unicode(content)) 718 for a, b in ((u'"', u'&quot;'), (u"'", u'&apos;')): 719 content = content.replace(a, b) 720 self.write(content)
721 722
723 -class BlastXmlWriter(object):
724 """Stream-based BLAST+ XML Writer.""" 725
726 - def __init__(self, handle):
727 self.xml = _BlastXmlGenerator(handle, 'utf-8')
728
729 - def write_file(self, qresults):
730 """Writes the XML contents to the output handle.""" 731 xml = self.xml 732 self.qresult_counter, self.hit_counter, self.hsp_counter, \ 733 self.frag_counter = 0, 0, 0, 0 734 735 # get the first qresult, since the preamble requires its attr values 736 first_qresult = next(qresults) 737 # start the XML document, set the root element, and create the preamble 738 xml.startDocument() 739 xml.startParent('BlastOutput') 740 self._write_preamble(first_qresult) 741 # and write the qresults 742 xml.startParent('BlastOutput_iterations') 743 self._write_qresults(chain([first_qresult], qresults)) 744 xml.endParents(2) 745 xml.endDocument() 746 747 return self.qresult_counter, self.hit_counter, self.hsp_counter, \ 748 self.frag_counter
749
750 - def _write_elem_block(self, block_name, map_name, obj, opt_dict=None):
751 """Writes sibling XML elements. 752 753 :param block_name: common element name prefix 754 :type block_name: string 755 :param map_name: name of mapping between element and attribute names 756 :type map_name: string 757 :param obj: object whose attribute value will be used 758 :type obj: object 759 :param opt_dict: custom element-attribute mapping 760 :type opt_dict: dictionary {string: string} 761 762 """ 763 if opt_dict is None: 764 opt_dict = {} 765 for elem, attr in _WRITE_MAPS[map_name]: 766 elem = block_name + elem 767 try: 768 content = str(getattr(obj, attr)) 769 except AttributeError: 770 # ensure attrs that is not present is optional 771 assert elem in _DTD_OPT, "Element %r (attribute %r) not " \ 772 "found" % (elem, attr) 773 else: 774 # custom element-attribute mapping, for fallback values 775 if elem in opt_dict: 776 content = opt_dict[elem] 777 self.xml.simpleElement(elem, content)
778
779 - def _write_preamble(self, qresult):
780 """Writes the XML file preamble.""" 781 xml = self.xml 782 783 for elem, attr in _WRITE_MAPS['preamble']: 784 elem = 'BlastOutput_' + elem 785 if elem == 'BlastOutput_param': 786 xml.startParent(elem) 787 self._write_param(qresult) 788 xml.endParent() 789 continue 790 try: 791 content = str(getattr(qresult, attr)) 792 except AttributeError: 793 assert elem in _DTD_OPT, "Element %s (attribute %s) not " \ 794 "found" % (elem, attr) 795 else: 796 if elem == 'BlastOutput_version': 797 content = '%s %s' % (qresult.program.upper(), 798 qresult.version) 799 elif qresult._blast_id: 800 if elem == 'BlastOutput_query-ID': 801 content = qresult._blast_id 802 elif elem == 'BlastOutput_query-def': 803 content = ' '.join([qresult.id, 804 qresult.description]).strip() 805 xml.simpleElement(elem, content)
806
807 - def _write_param(self, qresult):
808 """Writes the parameter block of the preamble.""" 809 xml = self.xml 810 xml.startParent('Parameters') 811 self._write_elem_block('Parameters_', 'param', qresult) 812 xml.endParent()
813
814 - def _write_qresults(self, qresults):
815 """Writes QueryResult objects into iteration elements.""" 816 xml = self.xml 817 818 for num, qresult in enumerate(qresults): 819 xml.startParent('Iteration') 820 xml.simpleElement('Iteration_iter-num', str(num + 1)) 821 opt_dict = {} 822 # use custom Iteration_query-ID and Iteration_query-def mapping 823 # if the query has a BLAST-generated ID 824 if qresult._blast_id: 825 opt_dict = { 826 'Iteration_query-ID': qresult._blast_id, 827 'Iteration_query-def': ' '.join([qresult.id, 828 qresult.description]).strip(), 829 } 830 self._write_elem_block('Iteration_', 'qresult', qresult, opt_dict) 831 # the Iteration_hits tag only has children if there are hits 832 if qresult: 833 xml.startParent('Iteration_hits') 834 self._write_hits(qresult.hits) 835 xml.endParent() 836 # otherwise it's a simple element without any contents 837 else: 838 xml.simpleElement('Iteration_hits', '') 839 840 xml.startParents('Iteration_stat', 'Statistics') 841 self._write_elem_block('Statistics_', 'stat', qresult) 842 xml.endParents(2) 843 # there's a message if no hits is present 844 if not qresult: 845 xml.simpleElement('Iteration_message', 'No hits found') 846 self.qresult_counter += 1 847 xml.endParent()
848
849 - def _write_hits(self, hits):
850 """Writes Hit objects.""" 851 xml = self.xml 852 853 for num, hit in enumerate(hits): 854 xml.startParent('Hit') 855 xml.simpleElement('Hit_num', str(num + 1)) 856 # use custom hit_id and hit_def mapping if the hit has a 857 # BLAST-generated ID 858 opt_dict = {} 859 if hit._blast_id: 860 opt_dict = { 861 'Hit_id': hit._blast_id, 862 'Hit_def': ' '.join([hit.id, hit.description]).strip(), 863 } 864 self._write_elem_block('Hit_', 'hit', hit, opt_dict) 865 xml.startParent('Hit_hsps') 866 self._write_hsps(hit.hsps) 867 self.hit_counter += 1 868 xml.endParents(2)
869
870 - def _write_hsps(self, hsps):
871 """Writes HSP objects.""" 872 xml = self.xml 873 for num, hsp in enumerate(hsps): 874 xml.startParent('Hsp') 875 xml.simpleElement('Hsp_num', str(num + 1)) 876 for elem, attr in _WRITE_MAPS['hsp']: 877 elem = 'Hsp_' + elem 878 try: 879 content = self._adjust_output(hsp, elem, attr) 880 # make sure any elements that is not present is optional 881 # in the DTD 882 except AttributeError: 883 assert elem in _DTD_OPT, "Element %s (attribute %s) not found" \ 884 % (elem, attr) 885 else: 886 xml.simpleElement(elem, str(content)) 887 self.hsp_counter += 1 888 self.frag_counter += len(hsp.fragments) 889 xml.endParent()
890
891 - def _adjust_output(self, hsp, elem, attr):
892 """Adjusts output to mimic native BLAST+ XML as much as possible.""" 893 894 # adjust coordinates 895 if attr in ('query_start', 'query_end', 'hit_start', 'hit_end', 896 'pattern_start', 'pattern_end'): 897 content = getattr(hsp, attr) + 1 898 if '_start' in attr: 899 content = getattr(hsp, attr) + 1 900 else: 901 content = getattr(hsp, attr) 902 903 # adjust for 'from' <--> 'to' flip if it's not a translated search 904 # and frames are different 905 # adapted from /src/algo/blast/format/blastxml_format.cpp#L216 906 if hsp.query_frame != 0 and hsp.hit_frame < 0: 907 if attr == 'hit_start': 908 content = getattr(hsp, 'hit_end') 909 elif attr == 'hit_end': 910 content = getattr(hsp, 'hit_start') + 1 911 912 # for seqrecord objects, we only need the sequence string 913 elif elem in ('Hsp_hseq', 'Hsp_qseq'): 914 content = str(getattr(hsp, attr).seq) 915 elif elem == 'Hsp_midline': 916 content = hsp.aln_annotation['similarity'] 917 elif elem in ('Hsp_evalue', 'Hsp_bit-score'): 918 # adapted from src/algo/blast/format/blastxml_format.cpp#L138-140 919 content = '%.*g' % (6, getattr(hsp, attr)) 920 else: 921 content = getattr(hsp, attr) 922 923 return content
924 925 926 # if not used as a module, run the doctest 927 if __name__ == "__main__": 928 from Bio._utils import run_doctest 929 run_doctest() 930