Package Bio :: Package SearchIO :: Package BlastIO :: Module blast_xml
[hide private]
[frames] | no frames]

Source Code for Module Bio.SearchIO.BlastIO.blast_xml

  1  # Copyright 2012 by Wibowo Arindrarto.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Bio.SearchIO parser for BLAST+ XML output formats.""" 
  7  # for more info: http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.mod.dtd 
  8   
  9  import sys 
 10  import re 
 11  from itertools import chain 
 12  from xml.sax.saxutils import XMLGenerator, escape 
 13   
 14   
 15  #For speed try to use cElementTree rather than ElementTree 
 16  try: 
 17      if (3, 0) <= sys.version_info[:2] <= (3, 1): 
 18          # Workaround for bug in python 3.0 and 3.1, 
 19          # see http://bugs.python.org/issue9257 
 20          from xml.etree import ElementTree as ElementTree 
 21      else: 
 22          from xml.etree import cElementTree as ElementTree 
 23  except ImportError: 
 24      from xml.etree import ElementTree as ElementTree 
 25   
 26   
 27  from Bio._py3k import _as_bytes, _bytes_to_string, unicode 
 28  _empty_bytes_string = _as_bytes("") 
 29   
 30  from Bio.Alphabet import generic_dna, generic_protein 
 31  from Bio.SearchIO._index import SearchIndexer 
 32  from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment 
 33   
 34   
 35  __all__ = ['BlastXmlParser', 'BlastXmlIndexer', 'BlastXmlWriter'] 
 36   
 37   
 38  # element - optional qresult attribute name mapping 
 39  _ELEM_QRESULT_OPT = { 
 40      'Statistics_db-num': ('stat_db_num', int), 
 41      'Statistics_db-len': ('stat_db_len', int), 
 42      'Statistics_eff-space': ('stat_eff_space', float), 
 43      'Statistics_hsp-len': ('stat_hsp_len', int), 
 44      'Statistics_kappa': ('stat_kappa', float), 
 45      'Statistics_lambda': ('stat_lambda', float), 
 46      'Statistics_entropy': ('stat_entropy', float), 
 47  } 
 48  # element - hit attribute name mapping 
 49  _ELEM_HIT = { 
 50      #'Hit_def': ('description', str),   # not set by this dict 
 51      'Hit_accession': ('accession', str), 
 52      'Hit_len': ('seq_len', int), 
 53  } 
 54  # element - hsp attribute name mapping 
 55  _ELEM_HSP = { 
 56      'Hsp_bit-score': ('bitscore', float), 
 57      'Hsp_score': ('bitscore_raw', int), 
 58      'Hsp_evalue': ('evalue', float), 
 59      'Hsp_identity': ('ident_num', int), 
 60      'Hsp_positive': ('pos_num', int), 
 61      'Hsp_gaps': ('gap_num', int), 
 62      'Hsp_density': ('density', float), 
 63  } 
 64  # element - fragment attribute name mapping 
 65  _ELEM_FRAG = { 
 66      'Hsp_query-from': ('query_start', int), 
 67      'Hsp_query-to': ('query_end', int), 
 68      'Hsp_hit-from': ('hit_start', int), 
 69      'Hsp_hit-to': ('hit_end', int), 
 70      'Hsp_query-frame': ('query_frame', int), 
 71      'Hsp_hit-frame': ('hit_frame', int), 
 72      'Hsp_align-len': ('aln_span', int), 
 73      'Hsp_pattern-from': ('pattern_start', int), 
 74      'Hsp_pattern-to': ('pattern_end', int), 
 75      'Hsp_hseq': ('hit', str), 
 76      'Hsp_qseq': ('query', str), 
 77  } 
 78  # dictionary for mapping tag name and meta key name 
 79  _ELEM_META = { 
 80      'BlastOutput_db': ('target', str), 
 81      'BlastOutput_program': ('program', str), 
 82      'BlastOutput_version': ('version', str), 
 83      'BlastOutput_reference': ('reference', str), 
 84      'Parameters_expect': ('param_evalue_threshold', float), 
 85      'Parameters_entrez-query': ('param_entrez_query', str), 
 86      'Parameters_filter': ('param_filter', str), 
 87      'Parameters_gap-extend': ('param_gap_extend', int), 
 88      'Parameters_gap-open': ('param_gap_open', int), 
 89      'Parameters_include': ('param_include', str), 
 90      'Parameters_matrix': ('param_matrix', str), 
 91      'Parameters_pattern': ('param_pattern', str), 
 92      'Parameters_sc-match': ('param_score_match', int), 
 93      'Parameters_sc-mismatch': ('param_score_mismatch', int), 
 94  } 
 95  # these are fallback tags that store information on the first query 
 96  # outside the <Iteration> tag 
 97  # only used if query_{ID,def,len} is not found in <Iteration> 
 98  # (seen in legacy Blast <2.2.14) 
 99  _ELEM_QRESULT_FALLBACK = { 
100      'BlastOutput_query-ID': ('id', str), 
101      'BlastOutput_query-def': ('description', str), 
102      'BlastOutput_query-len': ('len', str), 
103  } 
104  # element-attribute maps, for writing 
105  _WRITE_MAPS = { 
106      'preamble': ( 
107          ('program', 'program'), 
108          ('version', 'version'), 
109          ('reference', 'reference'), 
110          ('db', 'target'), 
111          ('query-ID', 'id'), 
112          ('query-def', 'description'), 
113          ('query-len', 'seq_len'), 
114          ('param', None), 
115      ), 
116      'param': ( 
117          ('matrix', 'param_matrix'), 
118          ('expect', 'param_evalue_threshold'), 
119          ('sc-match', 'param_score_match'), 
120          ('sc-mismatch', 'param_score_mismatch'), 
121          ('gap-open', 'param_gap_open'), 
122          ('gap-extend', 'param_gap_extend'), 
123          ('filter', 'param_filter'), 
124          ('pattern', 'param_pattern'), 
125          ('entrez-query', 'param_entrez_query'), 
126      ), 
127      'qresult': ( 
128          ('query-ID', 'id'), 
129          ('query-def', 'description'), 
130          ('query-len', 'seq_len'), 
131      ), 
132      'stat': ( 
133          ('db-num', 'stat_db_num'), 
134          ('db-len', 'stat_db_len'), 
135          ('hsp-len', 'stat_hsp_len'), 
136          ('eff-space', 'stat_eff_space'), 
137          ('kappa', 'stat_kappa'), 
138          ('lambda', 'stat_lambda'), 
139          ('entropy', 'stat_entropy'), 
140      ), 
141      'hit': ( 
142          ('id', 'id'), 
143          ('def', 'description'), 
144          ('accession', 'accession'), 
145          ('len', 'seq_len'), 
146      ), 
147      'hsp': ( 
148          ('bit-score', 'bitscore'), 
149          ('score', 'bitscore_raw'), 
150          ('evalue', 'evalue'), 
151          ('query-from', 'query_start'), 
152          ('query-to', 'query_end'), 
153          ('hit-from', 'hit_start'), 
154          ('hit-to', 'hit_end'), 
155          ('pattern-from', 'pattern_start'), 
156          ('pattern-to', 'pattern_end'), 
157          ('query-frame', 'query_frame'), 
158          ('hit-frame', 'hit_frame'), 
159          ('identity', 'ident_num'), 
160          ('positive', 'pos_num'), 
161          ('gaps', 'gap_num'), 
162          ('align-len', 'aln_span'), 
163          ('density', 'density'), 
164          ('qseq', 'query'), 
165          ('hseq', 'hit'), 
166          ('midline', None), 
167      ), 
168  } 
169  # optional elements, based on the DTD 
170  _DTD_OPT = ( 
171      'BlastOutput_query-seq', 'BlastOutput_mbstat', 'Iteration_query-def', 
172      'Iteration_query-len', 'Iteration-hits', 'Iteration_stat', 
173      'Iteration_message', 'Parameters_matrix', 'Parameters_include', 
174      'Parameters_sc-match', 'Parameters_sc-mismatch', 'Parameters_filter', 
175      'Parameters_pattern', 'Parameters_entrez-query', 'Hit_hsps', 
176      'Hsp_pattern-from', 'Hsp_pattern-to', 'Hsp_query-frame', 'Hsp_hit-frame', 
177      'Hsp_identity', 'Hsp_positive', 'Hsp_gaps', 'Hsp_align-len', 'Hsp_density', 
178      'Hsp_midline', 
179  ) 
180   
181  # compile RE patterns 
182  _RE_VERSION = re.compile(r'\d+\.\d+\.\d+\+?') 
183   
184   
185 -class BlastXmlParser(object):
186 """Parser for the BLAST XML format""" 187
188 - def __init__(self, handle):
189 self.xml_iter = iter(ElementTree.iterparse(handle, events=('start', 'end'))) 190 self._meta, self._fallback = self._parse_preamble()
191
192 - def __iter__(self):
193 for qresult in self._parse_qresult(): 194 yield qresult
195
196 - def _parse_preamble(self):
197 """Parses all tag data prior to the first query result.""" 198 # dictionary for containing all information prior to the first query 199 meta = {} 200 # dictionary for fallback information 201 fallback = {} 202 203 # parse the preamble part (anything prior to the first result) 204 for event, elem in self.xml_iter: 205 # get the tag values, cast appropriately, store into meta 206 if event == 'end' and elem.tag in _ELEM_META: 207 attr_name, caster = _ELEM_META[elem.tag] 208 209 if caster is not str: 210 meta[attr_name] = caster(elem.text) 211 else: 212 meta[attr_name] = elem.text 213 214 # delete element after we finish parsing it 215 elem.clear() 216 continue 217 # capture fallback values 218 # these are used only if the first <Iteration> does not have any 219 # ID, ref, or len. 220 elif event == 'end' and elem.tag in _ELEM_QRESULT_FALLBACK: 221 attr_name, caster = _ELEM_QRESULT_FALLBACK[elem.tag] 222 223 if caster is not str: 224 fallback[attr_name] = caster(elem.text) 225 else: 226 fallback[attr_name] = elem.text 227 228 elem.clear() 229 continue 230 231 if event == 'start' and elem.tag == 'Iteration': 232 break 233 234 # we only want the version number, sans the program name or date 235 if meta.get('version') is not None: 236 meta['version'] = re.search(_RE_VERSION, 237 meta['version']).group(0) 238 239 return meta, fallback
240
241 - def _parse_qresult(self):
242 """Parses query results.""" 243 # parse the queries 244 for event, qresult_elem in self.xml_iter: 245 # </Iteration> marks the end of a single query 246 # which means we can process it 247 if event == 'end' and qresult_elem.tag == 'Iteration': 248 249 # we'll use the following schema 250 # <!ELEMENT Iteration ( 251 # Iteration_iter-num, 252 # Iteration_query-ID?, 253 # Iteration_query-def?, 254 # Iteration_query-len?, 255 # Iteration_hits?, 256 # Iteration_stat?, 257 # Iteration_message?)> 258 259 # assign query attributes with fallbacks 260 query_id = qresult_elem.findtext('Iteration_query-ID') 261 if query_id is None: 262 query_id = self._fallback['id'] 263 264 query_desc = qresult_elem.findtext('Iteration_query-def') 265 if query_desc is None: 266 query_desc = self._fallback['description'] 267 268 query_len = qresult_elem.findtext('Iteration_query-len') 269 if query_len is None: 270 query_len = self._fallback['len'] 271 272 # handle blast searches against databases with Blast's IDs 273 # 'Query_' marks the beginning of a BLAST+-generated ID, 274 # 'lcl|' marks the beginning of a BLAST legacy-generated ID 275 if query_id.startswith('Query_') or query_id.startswith('lcl|'): 276 # store the Blast-generated query ID 277 blast_query_id = query_id 278 id_desc = query_desc.split(' ', 1) 279 query_id = id_desc[0] 280 try: 281 query_desc = id_desc[1] 282 except IndexError: 283 query_desc = '' 284 else: 285 blast_query_id = '' 286 287 hit_list, key_list = [], [] 288 for hit in self._parse_hit(qresult_elem.find('Iteration_hits'), 289 query_id): 290 if hit: 291 # need to keep track of hit IDs, since there could be duplicates, 292 if hit.id in key_list: 293 # fallback to Blast-generated IDs, if the ID is already present 294 # and restore the desc, too 295 hit.description = '%s %s' % (hit.id, hit.description) 296 hit.id = hit._blast_id 297 # and change the hit_id of the HSPs contained 298 for hsp in hit: 299 hsp.hit_id = hit._blast_id 300 else: 301 key_list.append(hit.id) 302 303 hit_list.append(hit) 304 305 # create qresult and assign its attributes 306 qresult = QueryResult(hit_list, query_id) 307 qresult.description = query_desc 308 qresult.seq_len = int(query_len) 309 qresult._blast_id = blast_query_id 310 for key, value in self._meta.items(): 311 setattr(qresult, key, value) 312 313 # statistics are stored in Iteration_stat's 'grandchildren' with the 314 # following DTD 315 # <!ELEMENT Statistics ( 316 # Statistics_db-num, 317 # Statistics_db-len, 318 # Statistics_hsp-len, 319 # Statistics_eff-space, 320 # Statistics_kappa, 321 # Statistics_lambda, 322 # Statistics_entropy)> 323 324 stat_iter_elem = qresult_elem.find('Iteration_stat') 325 if stat_iter_elem is not None: 326 stat_elem = stat_iter_elem.find('Statistics') 327 328 for key, val_info in _ELEM_QRESULT_OPT.items(): 329 value = stat_elem.findtext(key) 330 if value is not None: 331 caster = val_info[1] 332 # recast only if value is not intended to be str 333 if value is not None and caster is not str: 334 value = caster(value) 335 setattr(qresult, val_info[0], value) 336 337 # delete element after we finish parsing it 338 qresult_elem.clear() 339 yield qresult
340
341 - def _parse_hit(self, root_hit_elem, query_id):
342 """Generator that transforms Iteration_hits XML elements into Hit objects. 343 344 Arguments: 345 root_hit_elem -- Element object of the Iteration_hits tag. 346 query_id -- String of QueryResult ID of this Hit 347 348 """ 349 # Hit level processing 350 # Hits are stored in the Iteration_hits tag, with the following 351 # DTD 352 # <!ELEMENT Hit ( 353 # Hit_num, 354 # Hit_id, 355 # Hit_def, 356 # Hit_accession, 357 # Hit_len, 358 # Hit_hsps?)> 359 360 # feed the loop below an empty list so iteration still works 361 if root_hit_elem is None: 362 root_hit_elem = [] 363 364 for hit_elem in root_hit_elem: 365 366 # create empty hit object 367 hit_id = hit_elem.findtext('Hit_id') 368 hit_desc = hit_elem.findtext('Hit_def') 369 # handle blast searches against databases with Blast's IDs 370 if hit_id.startswith('gnl|BL_ORD_ID|'): 371 blast_hit_id = hit_id 372 id_desc = hit_desc.split(' ', 1) 373 hit_id = id_desc[0] 374 try: 375 hit_desc = id_desc[1] 376 except IndexError: 377 hit_desc = '' 378 else: 379 blast_hit_id = '' 380 381 hsps = [hsp for hsp in 382 self._parse_hsp(hit_elem.find('Hit_hsps'), 383 query_id, hit_id)] 384 385 hit = Hit(hsps) 386 hit.description = hit_desc 387 # blast_hit_id is only set if the hit ID is Blast-generated 388 hit._blast_id = blast_hit_id 389 390 for key, val_info in _ELEM_HIT.items(): 391 value = hit_elem.findtext(key) 392 if value is not None: 393 caster = val_info[1] 394 # recast only if value is not intended to be str 395 if value is not None and caster is not str: 396 value = caster(value) 397 setattr(hit, val_info[0], value) 398 399 # delete element after we finish parsing it 400 hit_elem.clear() 401 yield hit
402
403 - def _parse_hsp(self, root_hsp_frag_elem, query_id, hit_id):
404 """Iterator that transforms Hit_hsps XML elements into HSP objects. 405 406 Arguments: 407 root_hsp_frag_elem -- Element object of the Hit_hsps tag. 408 query_id -- Query ID string. 409 hit_id -- Hit ID string. 410 411 """ 412 # Hit_hsps DTD: 413 # <!ELEMENT Hsp ( 414 # Hsp_num, 415 # Hsp_bit-score, 416 # Hsp_score, 417 # Hsp_evalue, 418 # Hsp_query-from, 419 # Hsp_query-to, 420 # Hsp_hit-from, 421 # Hsp_hit-to, 422 # Hsp_pattern-from?, 423 # Hsp_pattern-to?, 424 # Hsp_query-frame?, 425 # Hsp_hit-frame?, 426 # Hsp_identity?, 427 # Hsp_positive?, 428 # Hsp_gaps?, 429 # Hsp_align-len?, 430 # Hsp_density?, 431 # Hsp_qseq, 432 # Hsp_hseq, 433 # Hsp_midline?)> 434 435 # if value is None, feed the loop below an empty list 436 if root_hsp_frag_elem is None: 437 root_hsp_frag_elem = [] 438 439 for hsp_frag_elem in root_hsp_frag_elem: 440 coords = {} # temporary container for coordinates 441 frag = HSPFragment(hit_id, query_id) 442 for key, val_info in _ELEM_FRAG.items(): 443 value = hsp_frag_elem.findtext(key) 444 caster = val_info[1] 445 446 # adjust 'from' and 'to' coordinates to 0-based ones 447 if value is not None: 448 if key.endswith('-from') or key.endswith('-to'): 449 # store coordinates for further processing 450 coords[val_info[0]] = caster(value) 451 continue 452 # recast only if value is not intended to be str 453 elif caster is not str: 454 value = caster(value) 455 setattr(frag, val_info[0], value) 456 457 # set the homology characters into aln_annotation dict 458 frag.aln_annotation['homology'] = \ 459 hsp_frag_elem.findtext('Hsp_midline') 460 461 # process coordinates 462 # since 'x-from' could be bigger than 'x-to', we need to figure 463 # out which one is smaller/bigger since 'x_start' is always smaller 464 # than 'x_end' 465 for coord_type in ('query', 'hit', 'pattern'): 466 start_type = coord_type + '_start' 467 end_type = coord_type + '_end' 468 try: 469 start = coords[start_type] 470 end = coords[end_type] 471 except KeyError: 472 continue 473 else: 474 # convert to python range and setattr 475 setattr(frag, start_type, min(start, end) - 1) 476 setattr(frag, end_type, max(start, end)) 477 478 # set alphabet, based on program 479 prog = self._meta.get('program') 480 if prog == 'blastn': 481 frag.alphabet = generic_dna 482 elif prog in ['blastp', 'blastx', 'tblastn', 'tblastx']: 483 frag.alphabet = generic_protein 484 485 hsp = HSP([frag]) 486 for key, val_info in _ELEM_HSP.items(): 487 value = hsp_frag_elem.findtext(key) 488 caster = val_info[1] 489 if value is not None: 490 if caster is not str: 491 value = caster(value) 492 setattr(hsp, val_info[0], value) 493 # delete element after we finish parsing it 494 hsp_frag_elem.clear() 495 yield hsp
496 497
498 -class BlastXmlIndexer(SearchIndexer):
499 """Indexer class for BLAST XML output.""" 500 501 _parser = BlastXmlParser 502 qstart_mark = _as_bytes('<Iteration>') 503 qend_mark = _as_bytes('</Iteration>') 504 block_size = 16384 505
506 - def __init__(self, filename):
507 SearchIndexer.__init__(self, filename) 508 # TODO: better way to do this? 509 iter_obj = self._parser(self._handle) 510 self._meta, self._fallback = iter_obj._meta, iter_obj._fallback
511
512 - def __iter__(self):
513 qstart_mark = self.qstart_mark 514 qend_mark = self.qend_mark 515 blast_id_mark = _as_bytes('Query_') 516 block_size = self.block_size 517 handle = self._handle 518 handle.seek(0) 519 re_desc = re.compile(_as_bytes(r'<Iteration_query-ID>(.*?)' 520 '</Iteration_query-ID>\s+?<Iteration_query-def>' 521 '(.*?)</Iteration_query-def>')) 522 re_desc_end = re.compile(_as_bytes(r'</Iteration_query-def>')) 523 counter = 0 524 525 while True: 526 start_offset = handle.tell() 527 line = handle.readline() 528 if not line: 529 break 530 if qstart_mark not in line: 531 continue 532 # The following requirements are to make supporting BGZF compressed 533 # BLAST XML files simpler (avoids complex offset manipulations): 534 assert line.count(qstart_mark) == 1, "XML without line breaks?" 535 assert line.lstrip().startswith(qstart_mark), line 536 if qend_mark in line: 537 # Should cope with <Iteration>...</Iteration> on one long line 538 block = line 539 else: 540 # Load the rest of this block up to and including </Iteration> 541 block = [line] 542 while line and qend_mark not in line: 543 line = handle.readline() 544 assert qstart_mark not in line, line 545 block.append(line) 546 assert line.rstrip().endswith(qend_mark), line 547 block = _empty_bytes_string.join(block) 548 assert block.count(qstart_mark) == 1, "XML without line breaks? %r" % block 549 assert block.count(qend_mark) == 1, "XML without line breaks? %r" % block 550 #Now we have a full <Iteration>...</Iteration> block, find the ID 551 regx = re.search(re_desc, block) 552 try: 553 qstart_desc = regx.group(2) 554 qstart_id = regx.group(1) 555 except AttributeError: 556 # use the fallback values 557 assert re.search(re_desc_end, block) 558 qstart_desc = _as_bytes(self._fallback['description']) 559 qstart_id = _as_bytes(self._fallback['id']) 560 if qstart_id.startswith(blast_id_mark): 561 qstart_id = qstart_desc.split(_as_bytes(' '), 1)[0] 562 yield _bytes_to_string(qstart_id), start_offset, len(block) 563 counter += 1
564
565 - def _parse(self, handle):
566 # overwrites SearchIndexer._parse, since we need to set the meta and 567 # fallback dictionaries to the parser 568 generator = self._parser(handle, **self._kwargs) 569 generator._meta = self._meta 570 generator._fallback = self._fallback 571 return next(iter(generator))
572
573 - def get_raw(self, offset):
574 qend_mark = self.qend_mark 575 handle = self._handle 576 handle.seek(offset) 577 578 qresult_raw = handle.readline() 579 assert qresult_raw.lstrip().startswith(self.qstart_mark) 580 while qend_mark not in qresult_raw: 581 qresult_raw += handle.readline() 582 assert qresult_raw.rstrip().endswith(qend_mark) 583 assert qresult_raw.count(qend_mark) == 1 584 # Note this will include any leading and trailing whitespace, in 585 # general expecting " <Iteration>\n...\n </Iteration>\n" 586 return qresult_raw
587 588
589 -class _BlastXmlGenerator(XMLGenerator):
590 """Event-based XML Generator.""" 591
592 - def __init__(self, out, encoding='utf-8', indent=" ", increment=2):
593 XMLGenerator.__init__(self, out, encoding) 594 # the indentation character 595 self._indent = indent 596 # nest level 597 self._level = 0 598 # how many indentation character should we increment per level 599 self._increment = increment 600 # container for names of tags with children 601 self._parent_stack = [] 602 # determine writer method 603 try: 604 # this should work for all platforms except Jython 605 self.write = self._write 606 except AttributeError: 607 # Jython uses self._out.write 608 self.write = self._out.write
609
610 - def startDocument(self):
611 """Starts the XML document.""" 612 self.write(u'<?xml version="1.0"?>\n' 613 '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" ' 614 '"http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">\n')
615
616 - def startElement(self, name, attrs={}, children=False):
617 """Starts an XML element. 618 619 Arguments: 620 name -- String of element name. 621 attrs -- Dictionary of element attributes. 622 children -- Boolean, whether the element has children or not. 623 624 """ 625 self.ignorableWhitespace(self._indent * self._level) 626 XMLGenerator.startElement(self, name, attrs)
627
628 - def endElement(self, name):
629 """Ends and XML element of the given name.""" 630 XMLGenerator.endElement(self, name) 631 self.write(u'\n')
632
633 - def startParent(self, name, attrs={}):
634 """Starts an XML element which has children. 635 636 Arguments: 637 name -- String of element name. 638 attrs -- Dictionary of element attributes. 639 640 """ 641 self.startElement(name, attrs, children=True) 642 self._level += self._increment 643 self.write(u'\n') 644 # append the element name, so we can end it later 645 self._parent_stack.append(name)
646
647 - def endParent(self):
648 """Ends an XML element with children.""" 649 # the element to end is the one on top of the stack 650 name = self._parent_stack.pop() 651 self._level -= self._increment 652 self.ignorableWhitespace(self._indent * self._level) 653 self.endElement(name)
654
655 - def startParents(self, *names):
656 """Starts XML elements without children.""" 657 for name in names: 658 self.startParent(name)
659
660 - def endParents(self, num):
661 """Ends XML elements, according to the given number.""" 662 for i in range(num): 663 self.endParent()
664
665 - def simpleElement(self, name, content=None):
666 """Creates an XML element without children with the given content.""" 667 self.startElement(name, attrs={}) 668 if content: 669 self.characters(content) 670 self.endElement(name)
671
672 - def characters(self, content):
673 content = escape(unicode(content)) 674 for a, b in ((u'"', u'&quot;'), (u"'", u'&apos;')): 675 content = content.replace(a, b) 676 self.write(content)
677 678
679 -class BlastXmlWriter(object):
680 """Stream-based BLAST+ XML Writer.""" 681
682 - def __init__(self, handle):
683 self.xml = _BlastXmlGenerator(handle, 'utf-8')
684
685 - def write_file(self, qresults):
686 """Writes the XML contents to the output handle.""" 687 xml = self.xml 688 self.qresult_counter, self.hit_counter, self.hsp_counter, \ 689 self.frag_counter = 0, 0, 0, 0 690 691 # get the first qresult, since the preamble requires its attr values 692 first_qresult = next(qresults) 693 # start the XML document, set the root element, and create the preamble 694 xml.startDocument() 695 xml.startParent('BlastOutput') 696 self._write_preamble(first_qresult) 697 # and write the qresults 698 xml.startParent('BlastOutput_iterations') 699 self._write_qresults(chain([first_qresult], qresults)) 700 xml.endParents(2) 701 xml.endDocument() 702 703 return self.qresult_counter, self.hit_counter, self.hsp_counter, \ 704 self.frag_counter
705
706 - def _write_elem_block(self, block_name, map_name, obj, opt_dict={}):
707 """Writes sibling XML elements. 708 709 Arguments: 710 block_name -- String of common element name prefix. 711 map_name -- Dictionary name to for mapping element and attribute names. 712 obj -- Object whose attribute values will be used. 713 opt_dict -- Dictionary for custom element-attribute mapping. 714 715 """ 716 for elem, attr in _WRITE_MAPS[map_name]: 717 elem = block_name + elem 718 try: 719 content = str(getattr(obj, attr)) 720 except AttributeError: 721 # ensure attrs that is not present is optional 722 assert elem in _DTD_OPT, "Element %r (attribute %r) not " \ 723 "found" % (elem, attr) 724 else: 725 # custom element-attribute mapping, for fallback values 726 if elem in opt_dict: 727 content = opt_dict[elem] 728 self.xml.simpleElement(elem, content)
729
730 - def _write_preamble(self, qresult):
731 """Writes the XML file preamble.""" 732 xml = self.xml 733 734 for elem, attr in _WRITE_MAPS['preamble']: 735 elem = 'BlastOutput_' + elem 736 if elem == 'BlastOutput_param': 737 xml.startParent(elem) 738 self._write_param(qresult) 739 xml.endParent() 740 continue 741 try: 742 content = str(getattr(qresult, attr)) 743 except AttributeError: 744 assert elem in _DTD_OPT, "Element %s (attribute %s) not " \ 745 "found" % (elem, attr) 746 else: 747 if elem == 'BlastOutput_version': 748 content = '%s %s' % (qresult.program.upper(), 749 qresult.version) 750 elif qresult._blast_id: 751 if elem == 'BlastOutput_query-ID': 752 content = qresult._blast_id 753 elif elem == 'BlastOutput_query-def': 754 content = ' '.join([qresult.id, 755 qresult.description]).strip() 756 xml.simpleElement(elem, content)
757
758 - def _write_param(self, qresult):
759 """Writes the parameter block of the preamble.""" 760 xml = self.xml 761 xml.startParent('Parameters') 762 self._write_elem_block('Parameters_', 'param', qresult) 763 xml.endParent()
764
765 - def _write_qresults(self, qresults):
766 """Writes QueryResult objects into iteration elements.""" 767 xml = self.xml 768 769 for num, qresult in enumerate(qresults): 770 xml.startParent('Iteration') 771 xml.simpleElement('Iteration_iter-num', str(num+1)) 772 opt_dict = {} 773 # use custom Iteration_query-ID and Iteration_query-def mapping 774 # if the query has a BLAST-generated ID 775 if qresult._blast_id: 776 opt_dict = { 777 'Iteration_query-ID': qresult._blast_id, 778 'Iteration_query-def': ' '.join([qresult.id, 779 qresult.description]).strip(), 780 } 781 self._write_elem_block('Iteration_', 'qresult', qresult, opt_dict) 782 # the Iteration_hits tag only has children if there are hits 783 if qresult: 784 xml.startParent('Iteration_hits') 785 self._write_hits(qresult.hits) 786 xml.endParent() 787 # otherwise it's a simple element without any contents 788 else: 789 xml.simpleElement('Iteration_hits', '') 790 791 xml.startParents('Iteration_stat', 'Statistics') 792 self._write_elem_block('Statistics_', 'stat', qresult) 793 xml.endParents(2) 794 # there's a message if no hits is present 795 if not qresult: 796 xml.simpleElement('Iteration_message', 'No hits found') 797 self.qresult_counter += 1 798 xml.endParent()
799
800 - def _write_hits(self, hits):
801 """Writes Hit objects.""" 802 xml = self.xml 803 804 for num, hit in enumerate(hits): 805 xml.startParent('Hit') 806 xml.simpleElement('Hit_num', str(num+1)) 807 # use custom hit_id and hit_def mapping if the hit has a 808 # BLAST-generated ID 809 opt_dict = {} 810 if hit._blast_id: 811 opt_dict = { 812 'Hit_id': hit._blast_id, 813 'Hit_def': ' '.join([hit.id, hit.description]).strip(), 814 } 815 self._write_elem_block('Hit_', 'hit', hit, opt_dict) 816 xml.startParent('Hit_hsps') 817 self._write_hsps(hit.hsps) 818 self.hit_counter += 1 819 xml.endParents(2)
820
821 - def _write_hsps(self, hsps):
822 """Writes HSP objects.""" 823 xml = self.xml 824 for num, hsp in enumerate(hsps): 825 xml.startParent('Hsp') 826 xml.simpleElement('Hsp_num', str(num+1)) 827 for elem, attr in _WRITE_MAPS['hsp']: 828 elem = 'Hsp_' + elem 829 try: 830 content = self._adjust_output(hsp, elem, attr) 831 # make sure any elements that is not present is optional 832 # in the DTD 833 except AttributeError: 834 assert elem in _DTD_OPT, "Element %s (attribute %s) not found" \ 835 % (elem, attr) 836 else: 837 xml.simpleElement(elem, str(content)) 838 self.hsp_counter += 1 839 self.frag_counter += len(hsp.fragments) 840 xml.endParent()
841
842 - def _adjust_output(self, hsp, elem, attr):
843 """Adjusts output to mimic native BLAST+ XML as much as possible.""" 844 845 # adjust coordinates 846 if attr in ('query_start', 'query_end', 'hit_start', 'hit_end', 847 'pattern_start', 'pattern_end'): 848 content = getattr(hsp, attr) + 1 849 if '_start' in attr: 850 content = getattr(hsp, attr) + 1 851 else: 852 content = getattr(hsp, attr) 853 854 # adjust for 'from' <--> 'to' flip if it's not a translated search 855 # and frames are different 856 # adapted from /src/algo/blast/format/blastxml_format.cpp#L216 857 if hsp.query_frame != 0 and hsp.hit_frame < 0: 858 if attr == 'hit_start': 859 content = getattr(hsp, 'hit_end') 860 elif attr == 'hit_end': 861 content = getattr(hsp, 'hit_start') + 1 862 863 # for seqrecord objects, we only need the sequence string 864 elif elem in ('Hsp_hseq', 'Hsp_qseq'): 865 content = str(getattr(hsp, attr).seq) 866 elif elem == 'Hsp_midline': 867 content = hsp.aln_annotation['homology'] 868 elif elem in ('Hsp_evalue', 'Hsp_bit-score'): 869 # adapted from src/algo/blast/format/blastxml_format.cpp#L138-140 870 content = '%.*g' % (6, getattr(hsp, attr)) 871 else: 872 content = getattr(hsp, attr) 873 874 return content
875 876 877 # if not used as a module, run the doctest 878 if __name__ == "__main__": 879 from Bio._utils import run_doctest 880 run_doctest() 881