Package Bio :: Package Blast :: Module NCBIXML
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIXML

  1  # Copyright 2000 by Bertrand Frottier .  All rights reserved. 
  2  # Revisions 2005-2006 copyright Michiel de Hoon 
  3  # Revisions 2006-2009 copyright Peter Cock 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  """Code to work with the BLAST XML output. 
  8   
  9  The BLAST XML DTD file is on the NCBI FTP site at: 
 10  ftp://ftp.ncbi.nlm.nih.gov/blast/documents/xml/NCBI_BlastOutput.dtd 
 11  """ 
 12  from __future__ import print_function 
 13   
 14  from Bio.Blast import Record 
 15  import xml.sax 
 16  from xml.sax.handler import ContentHandler 
 17  from functools import reduce 
 18   
 19   
20 -class _XMLparser(ContentHandler):
21 """Generic SAX Parser (PRIVATE). 22 23 Just a very basic SAX parser. 24 25 Redefine the methods startElement, characters and endElement. 26 """
27 - def __init__(self, debug=0):
28 """Constructor. 29 30 debug - integer, amount of debug information to print 31 """ 32 self._tag = [] 33 self._value = '' 34 self._debug = debug 35 self._debug_ignore_list = []
36
37 - def _secure_name(self, name):
38 """Removes 'dangerous' from tag names. 39 40 name -- name to be 'secured' 41 """ 42 # Replace '-' with '_' in XML tag names 43 return name.replace('-', '_')
44
45 - def startElement(self, name, attr):
46 """Found XML start tag. 47 48 No real need of attr, BLAST DTD doesn't use them 49 50 name -- name of the tag 51 52 attr -- tag attributes 53 """ 54 self._tag.append(name) 55 56 # Try to call a method (defined in subclasses) 57 method = self._secure_name('_start_' + name) 58 59 # Note could use try / except AttributeError 60 # BUT I found often triggered by nested errors... 61 if hasattr(self, method): 62 getattr(self, method)() 63 if self._debug > 4: 64 print("NCBIXML: Parsed: " + method) 65 elif self._debug > 3: 66 # Doesn't exist (yet) and may want to warn about it 67 if method not in self._debug_ignore_list: 68 print("NCBIXML: Ignored: " + method) 69 self._debug_ignore_list.append(method) 70 71 # We don't care about white space in parent tags like Hsp, 72 # but that white space doesn't belong to child tags like Hsp_midline 73 if self._value.strip(): 74 raise ValueError("What should we do with %s before the %s tag?" 75 % (repr(self._value), name)) 76 self._value = ""
77
78 - def characters(self, ch):
79 """Found some text. 80 81 ch -- characters read 82 """ 83 self._value += ch # You don't ever get the whole string
84
85 - def endElement(self, name):
86 """Found XML end tag. 87 88 name -- tag name 89 """ 90 # DON'T strip any white space, we may need it e.g. the hsp-midline 91 92 # Try to call a method (defined in subclasses) 93 method = self._secure_name('_end_' + name) 94 # Note could use try / except AttributeError 95 # BUT I found often triggered by nested errors... 96 if hasattr(self, method): 97 getattr(self, method)() 98 if self._debug > 2: 99 print("NCBIXML: Parsed: %s %s" % (method, self._value)) 100 elif self._debug > 1: 101 # Doesn't exist (yet) and may want to warn about it 102 if method not in self._debug_ignore_list: 103 print("NCBIXML: Ignored: %s %s" % (method, self._value)) 104 self._debug_ignore_list.append(method) 105 106 # Reset character buffer 107 self._value = ''
108 109
110 -class BlastParser(_XMLparser):
111 """Parse XML BLAST data into a Record.Blast object. 112 113 Parses XML output from BLAST (direct use discouraged). 114 This (now) returns a list of Blast records. 115 Historically it returned a single Blast record. 116 You are expected to use this via the parse or read functions. 117 118 All XML 'action' methods are private methods and may be: 119 - ``_start_TAG`` called when the start tag is found 120 - ``_end_TAG`` called when the end tag is found 121 """ 122
123 - def __init__(self, debug=0):
124 """Constructor. 125 126 debug - integer, amount of debug information to print 127 """ 128 # Calling superclass method 129 _XMLparser.__init__(self, debug) 130 131 self._parser = xml.sax.make_parser() 132 self._parser.setContentHandler(self) 133 134 # To avoid ValueError: unknown url type: NCBI_BlastOutput.dtd 135 self._parser.setFeature(xml.sax.handler.feature_validation, 0) 136 self._parser.setFeature(xml.sax.handler.feature_namespaces, 0) 137 self._parser.setFeature(xml.sax.handler.feature_external_pes, 0) 138 self._parser.setFeature(xml.sax.handler.feature_external_ges, 0) 139 140 self.reset()
141
142 - def reset(self):
143 """Reset all the data allowing reuse of the BlastParser() object.""" 144 self._records = [] 145 self._header = Record.Header() 146 self._parameters = Record.Parameters() 147 self._parameters.filter = None # Maybe I should update the class?
148
149 - def _start_Iteration(self):
150 self._blast = Record.Blast() 151 pass
152
153 - def _end_Iteration(self):
154 # We stored a lot of generic "top level" information 155 # in self._header (an object of type Record.Header) 156 self._blast.reference = self._header.reference 157 self._blast.date = self._header.date 158 self._blast.version = self._header.version 159 self._blast.database = self._header.database 160 self._blast.application = self._header.application 161 162 # These are required for "old" pre 2.2.14 files 163 # where only <BlastOutput_query-ID>, <BlastOutput_query-def> 164 # and <BlastOutput_query-len> were used. Now they 165 # are supplemented/replaced by <Iteration_query-ID>, 166 # <Iteration_query-def> and <Iteration_query-len> 167 if not hasattr(self._blast, "query") \ 168 or not self._blast.query: 169 self._blast.query = self._header.query 170 if not hasattr(self._blast, "query_id") \ 171 or not self._blast.query_id: 172 self._blast.query_id = self._header.query_id 173 if not hasattr(self._blast, "query_letters") \ 174 or not self._blast.query_letters: 175 self._blast.query_letters = self._header.query_letters 176 177 # Hack to record the query length as both the query_letters and 178 # query_length properties (as in the plain text parser, see 179 # Bug 2176 comment 12): 180 self._blast.query_length = self._blast.query_letters 181 # Perhaps in the long term we should deprecate one, but I would 182 # prefer to drop query_letters - so we need a transition period 183 # with both. 184 185 # Hack to record the claimed database size as database_length 186 # (as well as in num_letters_in_database, see Bug 2176 comment 13): 187 self._blast.database_length = self._blast.num_letters_in_database 188 # TODO? Deprecate database_letters next? 189 190 # Hack to record the claimed database sequence count as database_sequences 191 self._blast.database_sequences = self._blast.num_sequences_in_database 192 193 # Apply the "top level" parameter information 194 self._blast.matrix = self._parameters.matrix 195 self._blast.num_seqs_better_e = self._parameters.num_seqs_better_e 196 self._blast.gap_penalties = self._parameters.gap_penalties 197 self._blast.filter = self._parameters.filter 198 self._blast.expect = self._parameters.expect 199 self._blast.sc_match = self._parameters.sc_match 200 self._blast.sc_mismatch = self._parameters.sc_mismatch 201 202 # Add to the list 203 self._records.append(self._blast) 204 # Clear the object (a new empty one is create in _start_Iteration) 205 self._blast = None 206 207 if self._debug: 208 print("NCBIXML: Added Blast record to results")
209 210 # Header
211 - def _end_BlastOutput_program(self):
212 """BLAST program, e.g., blastp, blastn, etc. 213 214 Save this to put on each blast record object 215 """ 216 self._header.application = self._value.upper()
217
218 - def _end_BlastOutput_version(self):
219 """version number and date of the BLAST engine. 220 221 e.g. "BLASTX 2.2.12 [Aug-07-2005]" but there can also be 222 variants like "BLASTP 2.2.18+" without the date. 223 224 Save this to put on each blast record object 225 """ 226 parts = self._value.split() 227 # TODO - Check the first word starts with BLAST? 228 229 # The version is the second word (field one) 230 self._header.version = parts[1] 231 232 # Check there is a third word (the date) 233 if len(parts) >= 3: 234 if parts[2][0] == "[" and parts[2][-1] == "]": 235 self._header.date = parts[2][1:-1] 236 else: 237 # Assume this is still a date, but without the 238 # square brackets 239 self._header.date = parts[2]
240
242 """a reference to the article describing the algorithm (PRIVATE). 243 244 Save this to put on each blast record object 245 """ 246 self._header.reference = self._value
247
248 - def _end_BlastOutput_db(self):
249 """the database(s) searched (PRIVATE). 250 251 Save this to put on each blast record object 252 """ 253 self._header.database = self._value
254
256 """the identifier of the query (PRIVATE). 257 258 Important in old pre 2.2.14 BLAST, for recent versions 259 <Iteration_query-ID> is enough 260 """ 261 self._header.query_id = self._value
262
264 """the definition line of the query (PRIVATE). 265 266 Important in old pre 2.2.14 BLAST, for recent versions 267 <Iteration_query-def> is enough 268 """ 269 self._header.query = self._value
270
272 """the length of the query (PRIVATE). 273 274 Important in old pre 2.2.14 BLAST, for recent versions 275 <Iteration_query-len> is enough 276 """ 277 self._header.query_letters = int(self._value)
278
279 - def _end_Iteration_query_ID(self):
280 """the identifier of the query (PRIVATE).""" 281 self._blast.query_id = self._value
282
283 - def _end_Iteration_query_def(self):
284 """the definition line of the query (PRIVATE).""" 285 self._blast.query = self._value
286
287 - def _end_Iteration_query_len(self):
288 """the length of the query (PRIVATE).""" 289 self._blast.query_letters = int(self._value)
290 291 # def _end_BlastOutput_query_seq(self): 292 # """the query sequence (PRIVATE).""" 293 # pass # XXX Missing in Record.Blast ? 294 295 # def _end_BlastOutput_iter_num(self): 296 # """the psi-blast iteration number (PRIVATE).""" 297 # pass # XXX TODO PSI 298
299 - def _end_BlastOutput_hits(self):
300 """hits to the database sequences, one for every sequence (PRIVATE).""" 301 self._blast.num_hits = int(self._value)
302 303 # def _end_BlastOutput_message(self): 304 # """error messages (PRIVATE).""" 305 # pass # XXX What to do ? 306 307 # Parameters
308 - def _end_Parameters_matrix(self):
309 """matrix used (-M on legacy BLAST) (PRIVATE).""" 310 self._parameters.matrix = self._value
311
312 - def _end_Parameters_expect(self):
313 """expect values cutoff (PRIVATE).""" 314 # NOTE: In old text output there was a line: 315 # Number of sequences better than 1.0e-004: 1 316 # As far as I can see, parameters.num_seqs_better_e 317 # would take the value of 1, and the expectation 318 # value was not recorded. 319 # 320 # Anyway we should NOT record this against num_seqs_better_e 321 self._parameters.expect = self._value
322 323 # def _end_Parameters_include(self): 324 # """inclusion threshold for a psi-blast iteration (-h) (PRIVATE).""" 325 # pass # XXX TODO PSI 326
327 - def _end_Parameters_sc_match(self):
328 """match score for nucleotide-nucleotide comparison (-r) (PRIVATE).""" 329 self._parameters.sc_match = int(self._value)
330
332 """mismatch penalty for nucleotide-nucleotide comparison (-r) (PRIVATE).""" 333 self._parameters.sc_mismatch = int(self._value)
334
335 - def _end_Parameters_gap_open(self):
336 """gap existence cost (-G) (PRIVATE).""" 337 self._parameters.gap_penalties = int(self._value)
338
340 """gap extension cose (-E) (PRIVATE).""" 341 self._parameters.gap_penalties = (self._parameters.gap_penalties, 342 int(self._value))
343
344 - def _end_Parameters_filter(self):
345 """filtering options (-F) (PRIVATE).""" 346 self._parameters.filter = self._value
347 348 # def _end_Parameters_pattern(self): 349 # """pattern used for phi-blast search 350 # """ 351 # pass # XXX TODO PSI 352 353 # def _end_Parameters_entrez_query(self): 354 # """entrez query used to limit search 355 # """ 356 # pass # XXX TODO PSI 357 358 # Hits
359 - def _start_Hit(self):
360 self._blast.alignments.append(Record.Alignment()) 361 self._blast.descriptions.append(Record.Description()) 362 self._blast.multiple_alignment = [] 363 self._hit = self._blast.alignments[-1] 364 self._descr = self._blast.descriptions[-1] 365 self._descr.num_alignments = 0
366
367 - def _end_Hit(self):
368 # Cleanup 369 self._blast.multiple_alignment = None 370 self._hit = None 371 self._descr = None
372
373 - def _end_Hit_id(self):
374 """identifier of the database sequence (PRIVATE).""" 375 self._hit.hit_id = self._value 376 self._hit.title = self._value + ' '
377
378 - def _end_Hit_def(self):
379 """definition line of the database sequence (PRIVATE).""" 380 self._hit.hit_def = self._value 381 self._hit.title += self._value 382 self._descr.title = self._hit.title
383
384 - def _end_Hit_accession(self):
385 """accession of the database sequence (PRIVATE).""" 386 self._hit.accession = self._value 387 self._descr.accession = self._value
388
389 - def _end_Hit_len(self):
390 self._hit.length = int(self._value)
391 392 # HSPs
393 - def _start_Hsp(self):
394 # Note that self._start_Hit() should have been called 395 # to setup things like self._blast.multiple_alignment 396 self._hit.hsps.append(Record.HSP()) 397 self._hsp = self._hit.hsps[-1] 398 self._descr.num_alignments += 1 399 self._blast.multiple_alignment.append(Record.MultipleAlignment()) 400 self._mult_al = self._blast.multiple_alignment[-1]
401 402 # Hsp_num is useless
403 - def _end_Hsp_score(self):
404 """raw score of HSP (PRIVATE).""" 405 self._hsp.score = float(self._value) 406 if self._descr.score is None: 407 self._descr.score = float(self._value)
408
409 - def _end_Hsp_bit_score(self):
410 """bit score of HSP (PRIVATE).""" 411 self._hsp.bits = float(self._value) 412 if self._descr.bits is None: 413 self._descr.bits = float(self._value)
414
415 - def _end_Hsp_evalue(self):
416 """expect value of the HSP (PRIVATE).""" 417 self._hsp.expect = float(self._value) 418 if self._descr.e is None: 419 self._descr.e = float(self._value)
420
421 - def _end_Hsp_query_from(self):
422 """offset of query at the start of the alignment (one-offset) (PRIVATE).""" 423 self._hsp.query_start = int(self._value)
424
425 - def _end_Hsp_query_to(self):
426 """offset of query at the end of the alignment (one-offset) (PRIVATE).""" 427 self._hsp.query_end = int(self._value)
428
429 - def _end_Hsp_hit_from(self):
430 """offset of the database at the start of the alignment (one-offset) (PRIVATE).""" 431 self._hsp.sbjct_start = int(self._value)
432
433 - def _end_Hsp_hit_to(self):
434 """offset of the database at the end of the alignment (one-offset) (PRIVATE).""" 435 self._hsp.sbjct_end = int(self._value)
436 437 # def _end_Hsp_pattern_from(self): 438 # """start of phi-blast pattern on the query (one-offset) (PRIVATE).""" 439 # pass # XXX TODO PSI 440 441 # def _end_Hsp_pattern_to(self): 442 # """end of phi-blast pattern on the query (one-offset) (PRIVATE).""" 443 # pass # XXX TODO PSI 444
445 - def _end_Hsp_query_frame(self):
446 """frame of the query if applicable (PRIVATE).""" 447 self._hsp.frame = (int(self._value),)
448
449 - def _end_Hsp_hit_frame(self):
450 """frame of the database sequence if applicable (PRIVATE).""" 451 self._hsp.frame += (int(self._value),)
452
453 - def _end_Hsp_identity(self):
454 """number of identities in the alignment (PRIVATE).""" 455 self._hsp.identities = int(self._value)
456
457 - def _end_Hsp_positive(self):
458 """number of positive (conservative) substitutions in the alignment (PRIVATE).""" 459 self._hsp.positives = int(self._value)
460
461 - def _end_Hsp_gaps(self):
462 """number of gaps in the alignment (PRIVATE).""" 463 self._hsp.gaps = int(self._value)
464
465 - def _end_Hsp_align_len(self):
466 """length of the alignment (PRIVATE).""" 467 self._hsp.align_length = int(self._value)
468 469 # def _en_Hsp_density(self): 470 # """score density (PRIVATE).""" 471 # pass # XXX ??? 472
473 - def _end_Hsp_qseq(self):
474 """alignment string for the query (PRIVATE).""" 475 self._hsp.query = self._value
476
477 - def _end_Hsp_hseq(self):
478 """alignment string for the database (PRIVATE).""" 479 self._hsp.sbjct = self._value
480
481 - def _end_Hsp_midline(self):
482 """Formatting middle line as normally seen in BLAST report (PRIVATE).""" 483 self._hsp.match = self._value # do NOT strip spaces! 484 assert len(self._hsp.match) == len(self._hsp.query) 485 assert len(self._hsp.match) == len(self._hsp.sbjct)
486 487 # Statistics
488 - def _end_Statistics_db_num(self):
489 """number of sequences in the database (PRIVATE).""" 490 self._blast.num_sequences_in_database = int(self._value)
491
492 - def _end_Statistics_db_len(self):
493 """number of letters in the database (PRIVATE).""" 494 self._blast.num_letters_in_database = int(self._value)
495
496 - def _end_Statistics_hsp_len(self):
497 """the effective HSP length (PRIVATE).""" 498 self._blast.effective_hsp_length = int(self._value)
499
501 """the effective search space (PRIVATE).""" 502 self._blast.effective_search_space = float(self._value)
503
504 - def _end_Statistics_kappa(self):
505 """Karlin-Altschul parameter K (PRIVATE).""" 506 self._blast.ka_params = float(self._value)
507
508 - def _end_Statistics_lambda(self):
509 """Karlin-Altschul parameter Lambda (PRIVATE).""" 510 self._blast.ka_params = (float(self._value), self._blast.ka_params)
511
512 - def _end_Statistics_entropy(self):
513 """Karlin-Altschul parameter H (PRIVATE).""" 514 self._blast.ka_params = self._blast.ka_params + (float(self._value),)
515 516
517 -def read(handle, debug=0):
518 """Returns a single Blast record (assumes just one query). 519 520 Uses the BlastParser internally. 521 522 This function is for use when there is one and only one BLAST 523 result in your XML file. 524 525 Use the Bio.Blast.NCBIXML.parse() function if you expect more than 526 one BLAST record (i.e. if you have more than one query sequence). 527 """ 528 iterator = parse(handle, debug) 529 try: 530 first = next(iterator) 531 except StopIteration: 532 first = None 533 if first is None: 534 raise ValueError("No records found in handle") 535 try: 536 second = next(iterator) 537 except StopIteration: 538 second = None 539 if second is not None: 540 raise ValueError("More than one record found in handle") 541 return first
542 543
544 -def parse(handle, debug=0):
545 """Returns an iterator a Blast record for each query. 546 547 Incremental parser, this is an iterator that returns 548 Blast records. It uses the BlastParser internally. 549 550 handle - file handle to and XML file to parse 551 debug - integer, amount of debug information to print 552 553 This is a generator function that returns multiple Blast records 554 objects - one for each query sequence given to blast. The file 555 is read incrementally, returning complete records as they are read 556 in. 557 558 Should cope with new BLAST 2.2.14+ which gives a single XML file 559 for multiple query records. 560 561 Should also cope with XML output from older versions BLAST which 562 gave multiple XML files concatenated together (giving a single file 563 which strictly speaking wasn't valid XML). 564 """ 565 from xml.parsers import expat 566 BLOCK = 1024 567 MARGIN = 10 # must be at least length of newline + XML start 568 XML_START = "<?xml" 569 570 text = handle.read(BLOCK) 571 pending = "" 572 573 if not text: 574 # NO DATA FOUND! 575 raise ValueError("Your XML file was empty") 576 577 while text: 578 # We are now starting a new XML file 579 if not text.startswith(XML_START): 580 raise ValueError("Your XML file did not start with %s... " 581 "but instead %s" 582 % (XML_START, repr(text[:20]))) 583 584 expat_parser = expat.ParserCreate() 585 blast_parser = BlastParser(debug) 586 expat_parser.StartElementHandler = blast_parser.startElement 587 expat_parser.EndElementHandler = blast_parser.endElement 588 expat_parser.CharacterDataHandler = blast_parser.characters 589 590 expat_parser.Parse(text, False) 591 while blast_parser._records: 592 record = blast_parser._records[0] 593 blast_parser._records = blast_parser._records[1:] 594 yield record 595 596 while True: 597 # Read in another block of the file... 598 text, pending = pending + handle.read(BLOCK), "" 599 if not text: 600 # End of the file! 601 expat_parser.Parse("", True) # End of XML record 602 break 603 604 # Now read a little bit more so we can check for the 605 # start of another XML file... 606 pending = handle.read(MARGIN) 607 608 if ("\n" + XML_START) not in (text + pending): 609 # Good - still dealing with the same XML file 610 expat_parser.Parse(text, False) 611 while blast_parser._records: 612 yield blast_parser._records.pop(0) 613 else: 614 # This is output from pre 2.2.14 BLAST, 615 # one XML file for each query! 616 617 # Finish the old file: 618 text, pending = (text + pending).split("\n" + XML_START, 1) 619 pending = XML_START + pending 620 621 expat_parser.Parse(text, True) # End of XML record 622 while blast_parser._records: 623 yield blast_parser._records.pop(0) 624 625 # Now we are going to re-loop, reset the 626 # parsers and start reading the next XML file 627 text, pending = pending, "" 628 break 629 630 # this was added because it seems that the Jython expat parser 631 # was adding records later then the Python one 632 while blast_parser._records: 633 yield blast_parser._records.pop(0) 634 635 # At this point we have finished the first XML record. 636 # If the file is from an old version of blast, it may 637 # contain more XML records (check if text==""). 638 assert pending == "" 639 assert len(blast_parser._records) == 0 640 641 # We should have finished the file! 642 assert text == "" 643 assert pending == "" 644 assert len(blast_parser._records) == 0
645 646 if __name__ == '__main__': 647 import sys 648 with open(sys.argv[1]) as handle: 649 r_list = parse(handle) 650 651 for r in r_list: 652 # Small test 653 print('Blast of %s' % r.query) 654 print('Found %s alignments with a total of %s HSPs' 655 % (len(r.alignments), 656 reduce(lambda a, b: a + b, 657 [len(a.hsps) for a in r.alignments]))) 658 659 for al in r.alignments: 660 print("%s %i bp %i HSPs" % (al.title[:50], al.length, len(al.hsps))) 661 662 # Cookbook example 663 E_VALUE_THRESH = 0.04 664 for alignment in r.alignments: 665 for hsp in alignment.hsps: 666 if hsp.expect < E_VALUE_THRESH: 667 print('*****') 668 print('sequence %s' % alignment.title) 669 print('length %i' % alignment.length) 670 print('e value %f' % hsp.expect) 671 print(hsp.query[:75] + '...') 672 print(hsp.match[:75] + '...') 673 print(hsp.sbjct[:75] + '...') 674