Package Bio :: Package SearchIO :: Package BlastIO :: Module blast_xml
[hide private]
[frames] | no frames]

Source Code for Module Bio.SearchIO.BlastIO.blast_xml

  1  # Copyright 2012 by Wibowo Arindrarto.  All rights reserved. 
  2  # 
  3  # This file is part of the Biopython distribution and governed by your 
  4  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
  5  # Please see the LICENSE file that should have been included as part of this 
  6  # package. 
  7   
  8  """Bio.SearchIO parser for BLAST+ XML output formats.""" 
  9  # for more info: http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.mod.dtd 
 10   
 11  import sys 
 12  import re 
 13  import warnings 
 14  from itertools import chain 
 15  from xml.sax.saxutils import XMLGenerator, escape 
 16   
 17  from Bio import BiopythonParserWarning 
 18   
 19  # For speed try to use cElementTree rather than ElementTree 
 20  try: 
 21      if (3, 0) <= sys.version_info[:2] <= (3, 1): 
 22          # Workaround for bug in python 3.0 and 3.1, 
 23          # see http://bugs.python.org/issue9257 
 24          from xml.etree import ElementTree as ElementTree 
 25      else: 
 26          from xml.etree import cElementTree as ElementTree 
 27  except ImportError: 
 28      from xml.etree import ElementTree as ElementTree 
 29   
 30   
 31  from Bio.Alphabet import generic_dna, generic_protein 
 32  from Bio.SearchIO._index import SearchIndexer 
 33  from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment 
 34   
 35  from Bio._py3k import _as_bytes, _bytes_to_string, unicode 
 36  _empty_bytes_string = _as_bytes("") 
 37   
 38  __all__ = ('BlastXmlParser', 'BlastXmlIndexer', 'BlastXmlWriter') 
 39   
 40   
 41  # element - optional qresult attribute name mapping 
 42  _ELEM_QRESULT_OPT = { 
 43      'Statistics_db-num': ('stat_db_num', int), 
 44      'Statistics_db-len': ('stat_db_len', int), 
 45      'Statistics_eff-space': ('stat_eff_space', float), 
 46      'Statistics_hsp-len': ('stat_hsp_len', int), 
 47      'Statistics_kappa': ('stat_kappa', float), 
 48      'Statistics_lambda': ('stat_lambda', float), 
 49      'Statistics_entropy': ('stat_entropy', float), 
 50  } 
 51  # element - hit attribute name mapping 
 52  _ELEM_HIT = { 
 53      # 'Hit_def': ('description', str),   # not set by this dict 
 54      'Hit_accession': ('accession', str), 
 55      'Hit_len': ('seq_len', int), 
 56  } 
 57  # element - hsp attribute name mapping 
 58  _ELEM_HSP = { 
 59      'Hsp_bit-score': ('bitscore', float), 
 60      'Hsp_score': ('bitscore_raw', int), 
 61      'Hsp_evalue': ('evalue', float), 
 62      'Hsp_identity': ('ident_num', int), 
 63      'Hsp_positive': ('pos_num', int), 
 64      'Hsp_gaps': ('gap_num', int), 
 65      'Hsp_density': ('density', float), 
 66  } 
 67  # element - fragment attribute name mapping 
 68  _ELEM_FRAG = { 
 69      'Hsp_query-from': ('query_start', int), 
 70      'Hsp_query-to': ('query_end', int), 
 71      'Hsp_hit-from': ('hit_start', int), 
 72      'Hsp_hit-to': ('hit_end', int), 
 73      'Hsp_query-frame': ('query_frame', int), 
 74      'Hsp_hit-frame': ('hit_frame', int), 
 75      'Hsp_align-len': ('aln_span', int), 
 76      'Hsp_pattern-from': ('pattern_start', int), 
 77      'Hsp_pattern-to': ('pattern_end', int), 
 78      'Hsp_hseq': ('hit', str), 
 79      'Hsp_qseq': ('query', str), 
 80  } 
 81  # dictionary for mapping tag name and meta key name 
 82  _ELEM_META = { 
 83      'BlastOutput_db': ('target', str), 
 84      'BlastOutput_program': ('program', str), 
 85      'BlastOutput_version': ('version', str), 
 86      'BlastOutput_reference': ('reference', str), 
 87      'Parameters_expect': ('param_evalue_threshold', float), 
 88      'Parameters_entrez-query': ('param_entrez_query', str), 
 89      'Parameters_filter': ('param_filter', str), 
 90      'Parameters_gap-extend': ('param_gap_extend', int), 
 91      'Parameters_gap-open': ('param_gap_open', int), 
 92      'Parameters_include': ('param_include', str), 
 93      'Parameters_matrix': ('param_matrix', str), 
 94      'Parameters_pattern': ('param_pattern', str), 
 95      'Parameters_sc-match': ('param_score_match', int), 
 96      'Parameters_sc-mismatch': ('param_score_mismatch', int), 
 97  } 
 98  # these are fallback tags that store information on the first query 
 99  # outside the <Iteration> tag 
100  # only used if query_{ID,def,len} is not found in <Iteration> 
101  # (seen in legacy Blast <2.2.14) 
102  _ELEM_QRESULT_FALLBACK = { 
103      'BlastOutput_query-ID': ('id', str), 
104      'BlastOutput_query-def': ('description', str), 
105      'BlastOutput_query-len': ('len', str), 
106  } 
107  # element-attribute maps, for writing 
108  _WRITE_MAPS = { 
109      'preamble': ( 
110          ('program', 'program'), 
111          ('version', 'version'), 
112          ('reference', 'reference'), 
113          ('db', 'target'), 
114          ('query-ID', 'id'), 
115          ('query-def', 'description'), 
116          ('query-len', 'seq_len'), 
117          ('param', None), 
118      ), 
119      'param': ( 
120          ('matrix', 'param_matrix'), 
121          ('expect', 'param_evalue_threshold'), 
122          ('sc-match', 'param_score_match'), 
123          ('sc-mismatch', 'param_score_mismatch'), 
124          ('gap-open', 'param_gap_open'), 
125          ('gap-extend', 'param_gap_extend'), 
126          ('filter', 'param_filter'), 
127          ('pattern', 'param_pattern'), 
128          ('entrez-query', 'param_entrez_query'), 
129      ), 
130      'qresult': ( 
131          ('query-ID', 'id'), 
132          ('query-def', 'description'), 
133          ('query-len', 'seq_len'), 
134      ), 
135      'stat': ( 
136          ('db-num', 'stat_db_num'), 
137          ('db-len', 'stat_db_len'), 
138          ('hsp-len', 'stat_hsp_len'), 
139          ('eff-space', 'stat_eff_space'), 
140          ('kappa', 'stat_kappa'), 
141          ('lambda', 'stat_lambda'), 
142          ('entropy', 'stat_entropy'), 
143      ), 
144      'hit': ( 
145          ('id', 'id'), 
146          ('def', 'description'), 
147          ('accession', 'accession'), 
148          ('len', 'seq_len'), 
149      ), 
150      'hsp': ( 
151          ('bit-score', 'bitscore'), 
152          ('score', 'bitscore_raw'), 
153          ('evalue', 'evalue'), 
154          ('query-from', 'query_start'), 
155          ('query-to', 'query_end'), 
156          ('hit-from', 'hit_start'), 
157          ('hit-to', 'hit_end'), 
158          ('pattern-from', 'pattern_start'), 
159          ('pattern-to', 'pattern_end'), 
160          ('query-frame', 'query_frame'), 
161          ('hit-frame', 'hit_frame'), 
162          ('identity', 'ident_num'), 
163          ('positive', 'pos_num'), 
164          ('gaps', 'gap_num'), 
165          ('align-len', 'aln_span'), 
166          ('density', 'density'), 
167          ('qseq', 'query'), 
168          ('hseq', 'hit'), 
169          ('midline', None), 
170      ), 
171  } 
172  # optional elements, based on the DTD 
173  _DTD_OPT = ( 
174      'BlastOutput_query-seq', 'BlastOutput_mbstat', 'Iteration_query-def', 
175      'Iteration_query-len', 'Iteration-hits', 'Iteration_stat', 
176      'Iteration_message', 'Parameters_matrix', 'Parameters_include', 
177      'Parameters_sc-match', 'Parameters_sc-mismatch', 'Parameters_filter', 
178      'Parameters_pattern', 'Parameters_entrez-query', 'Hit_hsps', 
179      'Hsp_pattern-from', 'Hsp_pattern-to', 'Hsp_query-frame', 'Hsp_hit-frame', 
180      'Hsp_identity', 'Hsp_positive', 'Hsp_gaps', 'Hsp_align-len', 'Hsp_density', 
181      'Hsp_midline', 
182  ) 
183   
184  # compile RE patterns 
185  # for capturing BLAST version 
186  _RE_VERSION = re.compile(r'\d+\.\d+\.\d+\+?') 
187  # for splitting ID-description pairs 
188  _RE_ID_DESC_PAIRS_PATTERN = re.compile(" +>") 
189  # for splitting ID and description (must be used with maxsplit = 1) 
190  _RE_ID_DESC_PATTERN = re.compile(" +") 
191   
192   
193 -def _extract_ids_and_descs(raw_id, raw_desc):
194 """Extract IDs, descriptions, and raw ID from raw values (PRIVATE). 195 196 Given values of the `Hit_id` and `Hit_def` elements, this function returns 197 a tuple of three elements: all IDs, all descriptions, and the 198 BLAST-generated ID. The BLAST-generated ID is set to `None` if no 199 BLAST-generated IDs are present. 200 201 """ 202 ids = [] 203 descs = [] 204 205 blast_gen_id = raw_id 206 if raw_id.startswith('gnl|BL_ORD_ID|'): 207 id_desc_line = raw_desc 208 else: 209 id_desc_line = raw_id + ' ' + raw_desc 210 211 # create a list of lists, each list containing an ID and description 212 # or just an ID, if description is not present 213 id_desc_pairs = [re.split(_RE_ID_DESC_PATTERN, x, 1) 214 for x in re.split(_RE_ID_DESC_PAIRS_PATTERN, id_desc_line)] 215 # make sure empty descriptions are added as empty strings 216 # also, we return lists for compatibility reasons between Py2 and Py3 217 add_descs = lambda x: x if len(x) == 2 else x + [""] 218 for pair in (add_descs(p) for p in id_desc_pairs): 219 ids.append(pair[0]) 220 descs.append(pair[1]) 221 222 return (ids, descs, blast_gen_id)
223 224
225 -class BlastXmlParser(object):
226 """Parser for the BLAST XML format.""" 227
228 - def __init__(self, handle, use_raw_query_ids=False, use_raw_hit_ids=False):
229 """Initialize the class.""" 230 self.xml_iter = iter(ElementTree.iterparse(handle, events=('start', 'end'))) 231 self._use_raw_query_ids = use_raw_query_ids 232 self._use_raw_hit_ids = use_raw_hit_ids 233 self._meta, self._fallback = self._parse_preamble()
234
235 - def __iter__(self):
236 for qresult in self._parse_qresult(): 237 yield qresult
238
239 - def _parse_preamble(self):
240 """Parse all tag data prior to the first query result (PRIVATE).""" 241 # dictionary for containing all information prior to the first query 242 meta = {} 243 # dictionary for fallback information 244 fallback = {} 245 246 # parse the preamble part (anything prior to the first result) 247 for event, elem in self.xml_iter: 248 # get the tag values, cast appropriately, store into meta 249 if event == 'end' and elem.tag in _ELEM_META: 250 attr_name, caster = _ELEM_META[elem.tag] 251 252 if caster is not str: 253 meta[attr_name] = caster(elem.text) 254 else: 255 meta[attr_name] = elem.text 256 257 # delete element after we finish parsing it 258 elem.clear() 259 continue 260 # capture fallback values 261 # these are used only if the first <Iteration> does not have any 262 # ID, ref, or len. 263 elif event == 'end' and elem.tag in _ELEM_QRESULT_FALLBACK: 264 attr_name, caster = _ELEM_QRESULT_FALLBACK[elem.tag] 265 266 if caster is not str: 267 fallback[attr_name] = caster(elem.text) 268 else: 269 fallback[attr_name] = elem.text 270 271 elem.clear() 272 continue 273 274 if event == 'start' and elem.tag == 'Iteration': 275 break 276 277 # we only want the version number, sans the program name or date 278 if meta.get('version') is not None: 279 meta['version'] = re.search(_RE_VERSION, 280 meta['version']).group(0) 281 282 return meta, fallback
283
284 - def _parse_qresult(self):
285 """Parse query results (PRIVATE).""" 286 # parse the queries 287 for event, qresult_elem in self.xml_iter: 288 # </Iteration> marks the end of a single query 289 # which means we can process it 290 if event == 'end' and qresult_elem.tag == 'Iteration': 291 292 # we'll use the following schema 293 # <!ELEMENT Iteration ( 294 # Iteration_iter-num, 295 # Iteration_query-ID?, 296 # Iteration_query-def?, 297 # Iteration_query-len?, 298 # Iteration_hits?, 299 # Iteration_stat?, 300 # Iteration_message?)> 301 302 # assign query attributes with fallbacks 303 query_id = qresult_elem.findtext('Iteration_query-ID') 304 if query_id is None: 305 query_id = self._fallback['id'] 306 307 query_desc = qresult_elem.findtext('Iteration_query-def') 308 if query_desc is None: 309 query_desc = self._fallback['description'] 310 311 query_len = qresult_elem.findtext('Iteration_query-len') 312 if query_len is None: 313 query_len = self._fallback['len'] 314 315 blast_query_id = query_id 316 # handle blast searches against databases with Blast's IDs 317 # 'Query_' marks the beginning of a BLAST+-generated ID, 318 # 'lcl|' marks the beginning of a BLAST legacy-generated ID 319 if not self._use_raw_query_ids and \ 320 (query_id.startswith('Query_') or query_id.startswith('lcl|')): 321 # store the Blast-generated query ID 322 id_desc = query_desc.split(' ', 1) 323 query_id = id_desc[0] 324 try: 325 query_desc = id_desc[1] 326 except IndexError: 327 query_desc = '' 328 329 hit_list, key_list = [], [] 330 for hit in self._parse_hit(qresult_elem.find('Iteration_hits'), 331 query_id): 332 if hit: 333 # need to keep track of hit IDs, since there could be duplicates, 334 if hit.id in key_list: 335 warnings.warn("Renaming hit ID %r to a BLAST-generated ID " 336 "%r since the ID was already matched " 337 "by your query %r. Your BLAST database may contain " 338 "duplicate entries." % 339 (hit.id, hit.blast_id, query_id), BiopythonParserWarning) 340 # fallback to Blast-generated IDs, if the ID is already present 341 # and restore the desc, too 342 hit.description = '%s %s' % (hit.id, hit.description) 343 hit.id = hit.blast_id 344 # and change the hit_id of the HSPs contained 345 for hsp in hit: 346 hsp.hit_id = hit.blast_id 347 else: 348 key_list.append(hit.id) 349 350 hit_list.append(hit) 351 352 # create qresult and assign its attributes 353 qresult = QueryResult(hit_list, query_id) 354 qresult.description = query_desc 355 qresult.seq_len = int(query_len) 356 qresult.blast_id = blast_query_id 357 for key, value in self._meta.items(): 358 setattr(qresult, key, value) 359 360 # statistics are stored in Iteration_stat's 'grandchildren' with the 361 # following DTD 362 # <!ELEMENT Statistics ( 363 # Statistics_db-num, 364 # Statistics_db-len, 365 # Statistics_hsp-len, 366 # Statistics_eff-space, 367 # Statistics_kappa, 368 # Statistics_lambda, 369 # Statistics_entropy)> 370 371 stat_iter_elem = qresult_elem.find('Iteration_stat') 372 if stat_iter_elem is not None: 373 stat_elem = stat_iter_elem.find('Statistics') 374 375 for key, val_info in _ELEM_QRESULT_OPT.items(): 376 value = stat_elem.findtext(key) 377 if value is not None: 378 caster = val_info[1] 379 # recast only if value is not intended to be str 380 if value is not None and caster is not str: 381 value = caster(value) 382 setattr(qresult, val_info[0], value) 383 384 # delete element after we finish parsing it 385 qresult_elem.clear() 386 yield qresult
387
388 - def _parse_hit(self, root_hit_elem, query_id):
389 """Yield a generator object that transforms Iteration_hits XML elements into Hit objects (PRIVATE). 390 391 :param root_hit_elem: root element of the Iteration_hits tag. 392 :type root_hit_elem: XML element tag 393 :param query_id: QueryResult ID of this Hit 394 :type query_id: string 395 396 """ 397 # Hit level processing 398 # Hits are stored in the Iteration_hits tag, with the following 399 # DTD 400 # <!ELEMENT Hit ( 401 # Hit_num, 402 # Hit_id, 403 # Hit_def, 404 # Hit_accession, 405 # Hit_len, 406 # Hit_hsps?)> 407 408 # feed the loop below an empty list so iteration still works 409 if root_hit_elem is None: 410 root_hit_elem = [] 411 412 for hit_elem in root_hit_elem: 413 414 # BLAST sometimes mangles the sequence IDs and descriptions, so we need 415 # to extract the actual values. 416 raw_hit_id = hit_elem.findtext('Hit_id') 417 raw_hit_desc = hit_elem.findtext('Hit_def') 418 if not self._use_raw_hit_ids: 419 ids, descs, blast_hit_id = _extract_ids_and_descs(raw_hit_id, raw_hit_desc) 420 else: 421 ids, descs, blast_hit_id = [raw_hit_id], [raw_hit_desc], raw_hit_id 422 423 hit_id, alt_hit_ids = ids[0], ids[1:] 424 hit_desc, alt_hit_descs = descs[0], descs[1:] 425 426 hsps = [hsp for hsp in 427 self._parse_hsp(hit_elem.find('Hit_hsps'), 428 query_id, hit_id)] 429 430 hit = Hit(hsps) 431 hit.description = hit_desc 432 hit._id_alt = alt_hit_ids 433 hit._description_alt = alt_hit_descs 434 hit.blast_id = blast_hit_id 435 436 for key, val_info in _ELEM_HIT.items(): 437 value = hit_elem.findtext(key) 438 if value is not None: 439 caster = val_info[1] 440 # recast only if value is not intended to be str 441 if value is not None and caster is not str: 442 value = caster(value) 443 setattr(hit, val_info[0], value) 444 445 # delete element after we finish parsing it 446 hit_elem.clear() 447 yield hit
448
449 - def _parse_hsp(self, root_hsp_frag_elem, query_id, hit_id):
450 """Yield a generator object that transforms Hit_hsps XML elements into HSP objects (PRIVATE). 451 452 :param root_hsp_frag_elem: the ``Hit_hsps`` tag 453 :type root_hsp_frag_elem: XML element tag 454 :param query_id: query ID 455 :type query_id: string 456 :param hit_id: hit ID 457 :type hit_id: string 458 459 """ 460 # Hit_hsps DTD: 461 # <!ELEMENT Hsp ( 462 # Hsp_num, 463 # Hsp_bit-score, 464 # Hsp_score, 465 # Hsp_evalue, 466 # Hsp_query-from, 467 # Hsp_query-to, 468 # Hsp_hit-from, 469 # Hsp_hit-to, 470 # Hsp_pattern-from?, 471 # Hsp_pattern-to?, 472 # Hsp_query-frame?, 473 # Hsp_hit-frame?, 474 # Hsp_identity?, 475 # Hsp_positive?, 476 # Hsp_gaps?, 477 # Hsp_align-len?, 478 # Hsp_density?, 479 # Hsp_qseq, 480 # Hsp_hseq, 481 # Hsp_midline?)> 482 483 # if value is None, feed the loop below an empty list 484 if root_hsp_frag_elem is None: 485 root_hsp_frag_elem = [] 486 487 for hsp_frag_elem in root_hsp_frag_elem: 488 coords = {} # temporary container for coordinates 489 frag = HSPFragment(hit_id, query_id) 490 for key, val_info in _ELEM_FRAG.items(): 491 value = hsp_frag_elem.findtext(key) 492 caster = val_info[1] 493 494 # adjust 'from' and 'to' coordinates to 0-based ones 495 if value is not None: 496 if key.endswith('-from') or key.endswith('-to'): 497 # store coordinates for further processing 498 coords[val_info[0]] = caster(value) 499 continue 500 # recast only if value is not intended to be str 501 elif caster is not str: 502 value = caster(value) 503 setattr(frag, val_info[0], value) 504 505 # set the similarity characters into aln_annotation dict 506 frag.aln_annotation['similarity'] = hsp_frag_elem.findtext('Hsp_midline') 507 508 # process coordinates 509 # since 'x-from' could be bigger than 'x-to', we need to figure 510 # out which one is smaller/bigger since 'x_start' is always smaller 511 # than 'x_end' 512 for coord_type in ('query', 'hit', 'pattern'): 513 start_type = coord_type + '_start' 514 end_type = coord_type + '_end' 515 try: 516 start = coords[start_type] 517 end = coords[end_type] 518 except KeyError: 519 continue 520 else: 521 # convert to python range and setattr 522 setattr(frag, start_type, min(start, end) - 1) 523 setattr(frag, end_type, max(start, end)) 524 525 # set alphabet, based on program 526 prog = self._meta.get('program') 527 if prog == 'blastn': 528 frag.alphabet = generic_dna 529 elif prog in ['blastp', 'blastx', 'tblastn', 'tblastx']: 530 frag.alphabet = generic_protein 531 532 hsp = HSP([frag]) 533 for key, val_info in _ELEM_HSP.items(): 534 value = hsp_frag_elem.findtext(key) 535 caster = val_info[1] 536 if value is not None: 537 if caster is not str: 538 value = caster(value) 539 setattr(hsp, val_info[0], value) 540 # delete element after we finish parsing it 541 hsp_frag_elem.clear() 542 yield hsp
543 544
545 -class BlastXmlIndexer(SearchIndexer):
546 """Indexer class for BLAST XML output.""" 547 548 _parser = BlastXmlParser 549 qstart_mark = _as_bytes('<Iteration>') 550 qend_mark = _as_bytes('</Iteration>') 551 block_size = 16384 552
553 - def __init__(self, filename, **kwargs):
554 """Initialize the class.""" 555 SearchIndexer.__init__(self, filename) 556 # TODO: better way to do this? 557 iter_obj = self._parser(self._handle, **kwargs) 558 self._meta, self._fallback = iter_obj._meta, iter_obj._fallback
559
560 - def __iter__(self):
561 qstart_mark = self.qstart_mark 562 qend_mark = self.qend_mark 563 blast_id_mark = _as_bytes('Query_') 564 block_size = self.block_size 565 handle = self._handle 566 handle.seek(0) 567 re_desc = re.compile(_as_bytes(r'<Iteration_query-ID>(.*?)' 568 '</Iteration_query-ID>\s+?<Iteration_query-def>' 569 '(.*?)</Iteration_query-def>')) 570 re_desc_end = re.compile(_as_bytes(r'</Iteration_query-def>')) 571 counter = 0 572 573 while True: 574 start_offset = handle.tell() 575 line = handle.readline() 576 if not line: 577 break 578 if qstart_mark not in line: 579 continue 580 # The following requirements are to make supporting BGZF compressed 581 # BLAST XML files simpler (avoids complex offset manipulations): 582 assert line.count(qstart_mark) == 1, "XML without line breaks?" 583 assert line.lstrip().startswith(qstart_mark), line 584 if qend_mark in line: 585 # Should cope with <Iteration>...</Iteration> on one long line 586 block = line 587 else: 588 # Load the rest of this block up to and including </Iteration> 589 block = [line] 590 while line and qend_mark not in line: 591 line = handle.readline() 592 assert qstart_mark not in line, line 593 block.append(line) 594 assert line.rstrip().endswith(qend_mark), line 595 block = _empty_bytes_string.join(block) 596 assert block.count(qstart_mark) == 1, "XML without line breaks? %r" % block 597 assert block.count(qend_mark) == 1, "XML without line breaks? %r" % block 598 # Now we have a full <Iteration>...</Iteration> block, find the ID 599 regx = re.search(re_desc, block) 600 try: 601 qstart_desc = regx.group(2) 602 qstart_id = regx.group(1) 603 except AttributeError: 604 # use the fallback values 605 assert re.search(re_desc_end, block) 606 qstart_desc = _as_bytes(self._fallback['description']) 607 qstart_id = _as_bytes(self._fallback['id']) 608 if qstart_id.startswith(blast_id_mark): 609 qstart_id = qstart_desc.split(_as_bytes(' '), 1)[0] 610 yield _bytes_to_string(qstart_id), start_offset, len(block) 611 counter += 1
612
613 - def _parse(self, handle):
614 # overwrites SearchIndexer._parse, since we need to set the meta and 615 # fallback dictionaries to the parser 616 generator = self._parser(handle, **self._kwargs) 617 generator._meta = self._meta 618 generator._fallback = self._fallback 619 return next(iter(generator))
620
621 - def get_raw(self, offset):
622 """Return the raw record from the file as a bytes string.""" 623 qend_mark = self.qend_mark 624 handle = self._handle 625 handle.seek(offset) 626 627 qresult_raw = handle.readline() 628 assert qresult_raw.lstrip().startswith(self.qstart_mark) 629 while qend_mark not in qresult_raw: 630 qresult_raw += handle.readline() 631 assert qresult_raw.rstrip().endswith(qend_mark) 632 assert qresult_raw.count(qend_mark) == 1 633 # Note this will include any leading and trailing whitespace, in 634 # general expecting " <Iteration>\n...\n </Iteration>\n" 635 return qresult_raw
636 637
638 -class _BlastXmlGenerator(XMLGenerator):
639 """Event-based XML Generator.""" 640
641 - def __init__(self, out, encoding='utf-8', indent=" ", increment=2):
642 XMLGenerator.__init__(self, out, encoding) 643 # the indentation character 644 self._indent = indent 645 # nest level 646 self._level = 0 647 # how many indentation character should we increment per level 648 self._increment = increment 649 # container for names of tags with children 650 self._parent_stack = [] 651 # determine writer method 652 try: 653 # this should work for all platforms except Jython 654 self.write = self._write 655 except AttributeError: 656 # Jython uses self._out.write 657 self.write = self._out.write
658
659 - def startDocument(self):
660 """Start the XML document.""" 661 self.write(u'<?xml version="1.0"?>\n' 662 '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" ' 663 '"http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">\n')
664
665 - def startElement(self, name, attrs=None, children=False):
666 """Start an XML element. 667 668 :param name: element name 669 :type name: string 670 :param attrs: element attributes 671 :type attrs: dictionary {string: object} 672 :param children: whether the element has children or not 673 :type children: bool 674 675 """ 676 if attrs is None: 677 attrs = {} 678 self.ignorableWhitespace(self._indent * self._level) 679 XMLGenerator.startElement(self, name, attrs)
680
681 - def endElement(self, name):
682 """End and XML element of the given name.""" 683 XMLGenerator.endElement(self, name) 684 self.write(u'\n')
685
686 - def startParent(self, name, attrs=None):
687 """Start an XML element which has children. 688 689 :param name: element name 690 :type name: string 691 :param attrs: element attributes 692 :type attrs: dictionary {string: object} 693 694 """ 695 if attrs is None: 696 attrs = {} 697 self.startElement(name, attrs, children=True) 698 self._level += self._increment 699 self.write(u'\n') 700 # append the element name, so we can end it later 701 self._parent_stack.append(name)
702
703 - def endParent(self):
704 """End an XML element with children.""" 705 # the element to end is the one on top of the stack 706 name = self._parent_stack.pop() 707 self._level -= self._increment 708 self.ignorableWhitespace(self._indent * self._level) 709 self.endElement(name)
710
711 - def startParents(self, *names):
712 """Start XML elements without children.""" 713 for name in names: 714 self.startParent(name)
715
716 - def endParents(self, num):
717 """End XML elements, according to the given number.""" 718 for i in range(num): 719 self.endParent()
720
721 - def simpleElement(self, name, content=None):
722 """Create an XML element without children with the given content.""" 723 self.startElement(name, attrs={}) 724 if content: 725 self.characters(content) 726 self.endElement(name)
727
728 - def characters(self, content):
729 content = escape(unicode(content)) 730 for a, b in ((u'"', u'&quot;'), (u"'", u'&apos;')): 731 content = content.replace(a, b) 732 self.write(content)
733 734
735 -class BlastXmlWriter(object):
736 """Stream-based BLAST+ XML Writer.""" 737
738 - def __init__(self, handle, use_raw_query_ids=True, use_raw_hit_ids=True):
739 """Initialize the class.""" 740 self.xml = _BlastXmlGenerator(handle, 'utf-8') 741 self._use_raw_query_ids = use_raw_query_ids 742 self._use_raw_hit_ids = use_raw_hit_ids
743
744 - def write_file(self, qresults):
745 """Write the XML contents to the output handle.""" 746 xml = self.xml 747 self.qresult_counter, self.hit_counter, self.hsp_counter, \ 748 self.frag_counter = 0, 0, 0, 0 749 750 # get the first qresult, since the preamble requires its attr values 751 first_qresult = next(qresults) 752 # start the XML document, set the root element, and create the preamble 753 xml.startDocument() 754 xml.startParent('BlastOutput') 755 self._write_preamble(first_qresult) 756 # and write the qresults 757 xml.startParent('BlastOutput_iterations') 758 self._write_qresults(chain([first_qresult], qresults)) 759 xml.endParents(2) 760 xml.endDocument() 761 762 return self.qresult_counter, self.hit_counter, self.hsp_counter, \ 763 self.frag_counter
764
765 - def _write_elem_block(self, block_name, map_name, obj, opt_dict=None):
766 """Write sibling XML elements (PRIVATE). 767 768 :param block_name: common element name prefix 769 :type block_name: string 770 :param map_name: name of mapping between element and attribute names 771 :type map_name: string 772 :param obj: object whose attribute value will be used 773 :type obj: object 774 :param opt_dict: custom element-attribute mapping 775 :type opt_dict: dictionary {string: string} 776 777 """ 778 if opt_dict is None: 779 opt_dict = {} 780 for elem, attr in _WRITE_MAPS[map_name]: 781 elem = block_name + elem 782 try: 783 content = str(getattr(obj, attr)) 784 except AttributeError: 785 # ensure attrs that is not present is optional 786 assert elem in _DTD_OPT, "Element %r (attribute %r) not " \ 787 "found" % (elem, attr) 788 else: 789 # custom element-attribute mapping, for fallback values 790 if elem in opt_dict: 791 content = opt_dict[elem] 792 self.xml.simpleElement(elem, content)
793
794 - def _write_preamble(self, qresult):
795 """Write the XML file preamble (PRIVATE).""" 796 xml = self.xml 797 798 for elem, attr in _WRITE_MAPS['preamble']: 799 elem = 'BlastOutput_' + elem 800 if elem == 'BlastOutput_param': 801 xml.startParent(elem) 802 self._write_param(qresult) 803 xml.endParent() 804 continue 805 try: 806 content = str(getattr(qresult, attr)) 807 except AttributeError: 808 assert elem in _DTD_OPT, "Element %s (attribute %s) not " \ 809 "found" % (elem, attr) 810 else: 811 if elem == 'BlastOutput_version': 812 content = '%s %s' % (qresult.program.upper(), 813 qresult.version) 814 elif qresult.blast_id: 815 if elem == 'BlastOutput_query-ID': 816 content = qresult.blast_id 817 elif elem == 'BlastOutput_query-def': 818 content = ' '.join([qresult.id, 819 qresult.description]).strip() 820 xml.simpleElement(elem, content)
821
822 - def _write_param(self, qresult):
823 """Write the parameter block of the preamble (PRIVATE).""" 824 xml = self.xml 825 xml.startParent('Parameters') 826 self._write_elem_block('Parameters_', 'param', qresult) 827 xml.endParent()
828
829 - def _write_qresults(self, qresults):
830 """Write QueryResult objects into iteration elements (PRIVATE).""" 831 xml = self.xml 832 833 for num, qresult in enumerate(qresults): 834 xml.startParent('Iteration') 835 xml.simpleElement('Iteration_iter-num', str(num + 1)) 836 opt_dict = {} 837 if self._use_raw_query_ids: 838 query_id = qresult.blast_id 839 query_desc = qresult.id + ' ' + qresult.description 840 else: 841 query_id = qresult.id 842 query_desc = qresult.description 843 844 opt_dict = { 845 'Iteration_query-ID': query_id, 846 'Iteration_query-def': query_desc, 847 } 848 self._write_elem_block('Iteration_', 'qresult', qresult, opt_dict) 849 # the Iteration_hits tag only has children if there are hits 850 if qresult: 851 xml.startParent('Iteration_hits') 852 self._write_hits(qresult.hits) 853 xml.endParent() 854 # otherwise it's a simple element without any contents 855 else: 856 xml.simpleElement('Iteration_hits', '') 857 858 xml.startParents('Iteration_stat', 'Statistics') 859 self._write_elem_block('Statistics_', 'stat', qresult) 860 xml.endParents(2) 861 # there's a message if no hits is present 862 if not qresult: 863 xml.simpleElement('Iteration_message', 'No hits found') 864 self.qresult_counter += 1 865 xml.endParent()
866
867 - def _write_hits(self, hits):
868 """Write Hit objects (PRIVATE).""" 869 xml = self.xml 870 871 for num, hit in enumerate(hits): 872 xml.startParent('Hit') 873 xml.simpleElement('Hit_num', str(num + 1)) 874 # use custom hit_id and hit_def mapping if the hit has a 875 # BLAST-generated ID 876 opt_dict = {} 877 878 if self._use_raw_hit_ids: 879 hit_id = hit.blast_id 880 hit_desc = ' >'.join( 881 ['{} {}'.format(x, y) 882 for x, y in zip(hit.id_all, hit.description_all)]) 883 else: 884 hit_id = hit.id 885 hit_desc = hit.description + ' >'.join( 886 ['{} {}'.format(x, y) 887 for x, y in zip(hit.id_all[1:], hit.description_all[1:])]) 888 889 opt_dict = { 890 'Hit_id': hit_id, 891 'Hit_def': hit_desc, 892 } 893 self._write_elem_block('Hit_', 'hit', hit, opt_dict) 894 xml.startParent('Hit_hsps') 895 self._write_hsps(hit.hsps) 896 self.hit_counter += 1 897 xml.endParents(2)
898
899 - def _write_hsps(self, hsps):
900 """Write HSP objects (PRIVATE).""" 901 xml = self.xml 902 for num, hsp in enumerate(hsps): 903 xml.startParent('Hsp') 904 xml.simpleElement('Hsp_num', str(num + 1)) 905 for elem, attr in _WRITE_MAPS['hsp']: 906 elem = 'Hsp_' + elem 907 try: 908 content = self._adjust_output(hsp, elem, attr) 909 # make sure any elements that is not present is optional 910 # in the DTD 911 except AttributeError: 912 assert elem in _DTD_OPT, "Element %s (attribute %s) not found" \ 913 % (elem, attr) 914 else: 915 xml.simpleElement(elem, str(content)) 916 self.hsp_counter += 1 917 self.frag_counter += len(hsp.fragments) 918 xml.endParent()
919
920 - def _adjust_output(self, hsp, elem, attr):
921 """Adjust output to mimic native BLAST+ XML as much as possible (PRIVATE).""" 922 # adjust coordinates 923 if attr in ('query_start', 'query_end', 'hit_start', 'hit_end', 924 'pattern_start', 'pattern_end'): 925 content = getattr(hsp, attr) + 1 926 if '_start' in attr: 927 content = getattr(hsp, attr) + 1 928 else: 929 content = getattr(hsp, attr) 930 931 # adjust for 'from' <--> 'to' flip if it's not a translated search 932 # and frames are different 933 # adapted from /src/algo/blast/format/blastxml_format.cpp#L216 934 if hsp.query_frame != 0 and hsp.hit_frame < 0: 935 if attr == 'hit_start': 936 content = getattr(hsp, 'hit_end') 937 elif attr == 'hit_end': 938 content = getattr(hsp, 'hit_start') + 1 939 940 # for seqrecord objects, we only need the sequence string 941 elif elem in ('Hsp_hseq', 'Hsp_qseq'): 942 content = str(getattr(hsp, attr).seq) 943 elif elem == 'Hsp_midline': 944 content = hsp.aln_annotation['similarity'] 945 elif elem in ('Hsp_evalue', 'Hsp_bit-score'): 946 # adapted from src/algo/blast/format/blastxml_format.cpp#L138-140 947 content = '%.*g' % (6, getattr(hsp, attr)) 948 else: 949 content = getattr(hsp, attr) 950 951 return content
952 953 954 # if not used as a module, run the doctest 955 if __name__ == "__main__": 956 from Bio._utils import run_doctest 957 run_doctest() 958