Package Bio :: Package Blast :: Module NCBIXML
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIXML

  1  # Copyright 2000 by Bertrand Frottier .  All rights reserved. 
  2  # Revisions 2005-2006 copyright Michiel de Hoon 
  3  # Revisions 2006-2009 copyright Peter Cock 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  """This module provides code to work with the BLAST XML output 
  8  following the DTD available on the NCBI FTP 
  9  ftp://ftp.ncbi.nlm.nih.gov/blast/documents/xml/NCBI_BlastOutput.dtd 
 10   
 11  Classes: 
 12  BlastParser         Parses XML output from BLAST (direct use discouraged). 
 13                      This (now) returns a list of Blast records. 
 14                      Historically it returned a single Blast record. 
 15                      You are expected to use this via the parse or read functions. 
 16   
 17  _XMLParser          Generic SAX parser (private). 
 18   
 19  Functions: 
 20  parse               Incremental parser, this is an iterator that returns 
 21                      Blast records.  It uses the BlastParser internally. 
 22  read                Returns a single Blast record. Uses the BlastParser internally. 
 23  """ 
 24  from Bio.Blast import Record 
 25  import xml.sax 
 26  from xml.sax.handler import ContentHandler 
 27   
 28   
29 -class _XMLparser(ContentHandler):
30 """Generic SAX Parser 31 32 Just a very basic SAX parser. 33 34 Redefine the methods startElement, characters and endElement. 35 """
36 - def __init__(self, debug=0):
37 """Constructor 38 39 debug - integer, amount of debug information to print 40 """ 41 self._tag = [] 42 self._value = '' 43 self._debug = debug 44 self._debug_ignore_list = []
45
46 - def _secure_name(self, name):
47 """Removes 'dangerous' from tag names 48 49 name -- name to be 'secured' 50 """ 51 # Replace '-' with '_' in XML tag names 52 return name.replace('-', '_')
53
54 - def startElement(self, name, attr):
55 """Found XML start tag 56 57 No real need of attr, BLAST DTD doesn't use them 58 59 name -- name of the tag 60 61 attr -- tag attributes 62 """ 63 self._tag.append(name) 64 65 # Try to call a method (defined in subclasses) 66 method = self._secure_name('_start_' + name) 67 68 #Note could use try / except AttributeError 69 #BUT I found often triggered by nested errors... 70 if hasattr(self, method): 71 eval("self.%s()" % method) 72 if self._debug > 4: 73 print "NCBIXML: Parsed: " + method 74 elif self._debug > 3: 75 # Doesn't exist (yet) and may want to warn about it 76 if method not in self._debug_ignore_list: 77 print "NCBIXML: Ignored: " + method 78 self._debug_ignore_list.append(method) 79 80 #We don't care about white space in parent tags like Hsp, 81 #but that white space doesn't belong to child tags like Hsp_midline 82 if self._value.strip(): 83 raise ValueError("What should we do with %s before the %s tag?" 84 % (repr(self._value), name)) 85 self._value = ""
86
87 - def characters(self, ch):
88 """Found some text 89 90 ch -- characters read 91 """ 92 self._value += ch # You don't ever get the whole string
93
94 - def endElement(self, name):
95 """Found XML end tag 96 97 name -- tag name 98 """ 99 # DON'T strip any white space, we may need it e.g. the hsp-midline 100 101 # Try to call a method (defined in subclasses) 102 method = self._secure_name('_end_' + name) 103 #Note could use try / except AttributeError 104 #BUT I found often triggered by nested errors... 105 if hasattr(self, method): 106 eval("self.%s()" % method) 107 if self._debug > 2: 108 print "NCBIXML: Parsed: " + method, self._value 109 elif self._debug > 1: 110 # Doesn't exist (yet) and may want to warn about it 111 if method not in self._debug_ignore_list: 112 print "NCBIXML: Ignored: " + method, self._value 113 self._debug_ignore_list.append(method) 114 115 # Reset character buffer 116 self._value = ''
117 118
119 -class BlastParser(_XMLparser):
120 """Parse XML BLAST data into a Record.Blast object 121 122 All XML 'action' methods are private methods and may be: 123 _start_TAG called when the start tag is found 124 _end_TAG called when the end tag is found 125 """ 126
127 - def __init__(self, debug=0):
128 """Constructor 129 130 debug - integer, amount of debug information to print 131 """ 132 # Calling superclass method 133 _XMLparser.__init__(self, debug) 134 135 self._parser = xml.sax.make_parser() 136 self._parser.setContentHandler(self) 137 138 # To avoid ValueError: unknown url type: NCBI_BlastOutput.dtd 139 self._parser.setFeature(xml.sax.handler.feature_validation, 0) 140 self._parser.setFeature(xml.sax.handler.feature_namespaces, 0) 141 self._parser.setFeature(xml.sax.handler.feature_external_pes, 0) 142 self._parser.setFeature(xml.sax.handler.feature_external_ges, 0) 143 144 self.reset()
145
146 - def reset(self):
147 """Reset all the data allowing reuse of the BlastParser() object""" 148 self._records = [] 149 self._header = Record.Header() 150 self._parameters = Record.Parameters() 151 self._parameters.filter = None # Maybe I should update the class?
152
153 - def _start_Iteration(self):
154 self._blast = Record.Blast() 155 pass
156
157 - def _end_Iteration(self):
158 # We stored a lot of generic "top level" information 159 # in self._header (an object of type Record.Header) 160 self._blast.reference = self._header.reference 161 self._blast.date = self._header.date 162 self._blast.version = self._header.version 163 self._blast.database = self._header.database 164 self._blast.application = self._header.application 165 166 # These are required for "old" pre 2.2.14 files 167 # where only <BlastOutput_query-ID>, <BlastOutput_query-def> 168 # and <BlastOutput_query-len> were used. Now they 169 # are suplemented/replaced by <Iteration_query-ID>, 170 # <Iteration_query-def> and <Iteration_query-len> 171 if not hasattr(self._blast, "query") \ 172 or not self._blast.query: 173 self._blast.query = self._header.query 174 if not hasattr(self._blast, "query_id") \ 175 or not self._blast.query_id: 176 self._blast.query_id = self._header.query_id 177 if not hasattr(self._blast, "query_letters") \ 178 or not self._blast.query_letters: 179 self._blast.query_letters = self._header.query_letters 180 181 # Hack to record the query length as both the query_letters and 182 # query_length properties (as in the plain text parser, see 183 # Bug 2176 comment 12): 184 self._blast.query_length = self._blast.query_letters 185 # Perhaps in the long term we should deprecate one, but I would 186 # prefer to drop query_letters - so we need a transition period 187 # with both. 188 189 # Hack to record the claimed database size as database_length 190 # (as well as in num_letters_in_database, see Bug 2176 comment 13): 191 self._blast.database_length = self._blast.num_letters_in_database 192 # TODO? Deprecate database_letters next? 193 194 # Hack to record the claimed database sequence count as database_sequences 195 self._blast.database_sequences = self._blast.num_sequences_in_database 196 197 # Apply the "top level" parameter information 198 self._blast.matrix = self._parameters.matrix 199 self._blast.num_seqs_better_e = self._parameters.num_seqs_better_e 200 self._blast.gap_penalties = self._parameters.gap_penalties 201 self._blast.filter = self._parameters.filter 202 self._blast.expect = self._parameters.expect 203 self._blast.sc_match = self._parameters.sc_match 204 self._blast.sc_mismatch = self._parameters.sc_mismatch 205 206 #Add to the list 207 self._records.append(self._blast) 208 #Clear the object (a new empty one is create in _start_Iteration) 209 self._blast = None 210 211 if self._debug: 212 print "NCBIXML: Added Blast record to results"
213 214 # Header
215 - def _end_BlastOutput_program(self):
216 """BLAST program, e.g., blastp, blastn, etc. 217 218 Save this to put on each blast record object 219 """ 220 self._header.application = self._value.upper()
221
222 - def _end_BlastOutput_version(self):
223 """version number and date of the BLAST engine. 224 225 e.g. "BLASTX 2.2.12 [Aug-07-2005]" but there can also be 226 variants like "BLASTP 2.2.18+" without the date. 227 228 Save this to put on each blast record object 229 """ 230 parts = self._value.split() 231 #TODO - Check the first word starts with BLAST? 232 233 #The version is the second word (field one) 234 self._header.version = parts[1] 235 236 #Check there is a third word (the date) 237 if len(parts) >= 3: 238 if parts[2][0] == "[" and parts[2][-1] == "]": 239 self._header.date = parts[2][1:-1] 240 else: 241 #Assume this is still a date, but without the 242 #square brackets 243 self._header.date = parts[2]
244
246 """a reference to the article describing the algorithm 247 248 Save this to put on each blast record object 249 """ 250 self._header.reference = self._value
251
252 - def _end_BlastOutput_db(self):
253 """the database(s) searched 254 255 Save this to put on each blast record object 256 """ 257 self._header.database = self._value
258
260 """the identifier of the query 261 262 Important in old pre 2.2.14 BLAST, for recent versions 263 <Iteration_query-ID> is enough 264 """ 265 self._header.query_id = self._value
266
268 """the definition line of the query 269 270 Important in old pre 2.2.14 BLAST, for recent versions 271 <Iteration_query-def> is enough 272 """ 273 self._header.query = self._value
274
276 """the length of the query 277 278 Important in old pre 2.2.14 BLAST, for recent versions 279 <Iteration_query-len> is enough 280 """ 281 self._header.query_letters = int(self._value)
282
283 - def _end_Iteration_query_ID(self):
284 """the identifier of the query 285 """ 286 self._blast.query_id = self._value
287
288 - def _end_Iteration_query_def(self):
289 """the definition line of the query 290 """ 291 self._blast.query = self._value
292
293 - def _end_Iteration_query_len(self):
294 """the length of the query 295 """ 296 self._blast.query_letters = int(self._value)
297 298 ## def _end_BlastOutput_query_seq(self): 299 ## """the query sequence 300 ## """ 301 ## pass # XXX Missing in Record.Blast ? 302 303 ## def _end_BlastOutput_iter_num(self): 304 ## """the psi-blast iteration number 305 ## """ 306 ## pass # XXX TODO PSI 307
308 - def _end_BlastOutput_hits(self):
309 """hits to the database sequences, one for every sequence 310 """ 311 self._blast.num_hits = int(self._value)
312 313 ## def _end_BlastOutput_message(self): 314 ## """error messages 315 ## """ 316 ## pass # XXX What to do ? 317 318 # Parameters
319 - def _end_Parameters_matrix(self):
320 """matrix used (-M) 321 """ 322 self._parameters.matrix = self._value
323
324 - def _end_Parameters_expect(self):
325 """expect values cutoff (-e) 326 """ 327 # NOTE: In old text output there was a line: 328 # Number of sequences better than 1.0e-004: 1 329 # As far as I can see, parameters.num_seqs_better_e 330 # would take the value of 1, and the expectation 331 # value was not recorded. 332 # 333 # Anyway we should NOT record this against num_seqs_better_e 334 self._parameters.expect = self._value
335 336 ## def _end_Parameters_include(self): 337 ## """inclusion threshold for a psi-blast iteration (-h) 338 ## """ 339 ## pass # XXX TODO PSI 340
341 - def _end_Parameters_sc_match(self):
342 """match score for nucleotide-nucleotide comparaison (-r) 343 """ 344 self._parameters.sc_match = int(self._value)
345
347 """mismatch penalty for nucleotide-nucleotide comparaison (-r) 348 """ 349 self._parameters.sc_mismatch = int(self._value)
350
351 - def _end_Parameters_gap_open(self):
352 """gap existence cost (-G) 353 """ 354 self._parameters.gap_penalties = int(self._value)
355
357 """gap extension cose (-E) 358 """ 359 self._parameters.gap_penalties = (self._parameters.gap_penalties, 360 int(self._value))
361
362 - def _end_Parameters_filter(self):
363 """filtering options (-F) 364 """ 365 self._parameters.filter = self._value
366 367 ## def _end_Parameters_pattern(self): 368 ## """pattern used for phi-blast search 369 ## """ 370 ## pass # XXX TODO PSI 371 372 ## def _end_Parameters_entrez_query(self): 373 ## """entrez query used to limit search 374 ## """ 375 ## pass # XXX TODO PSI 376 377 # Hits
378 - def _start_Hit(self):
379 self._blast.alignments.append(Record.Alignment()) 380 self._blast.descriptions.append(Record.Description()) 381 self._blast.multiple_alignment = [] 382 self._hit = self._blast.alignments[-1] 383 self._descr = self._blast.descriptions[-1] 384 self._descr.num_alignments = 0
385
386 - def _end_Hit(self):
387 #Cleanup 388 self._blast.multiple_alignment = None 389 self._hit = None 390 self._descr = None
391
392 - def _end_Hit_id(self):
393 """identifier of the database sequence 394 """ 395 self._hit.hit_id = self._value 396 self._hit.title = self._value + ' '
397
398 - def _end_Hit_def(self):
399 """definition line of the database sequence 400 """ 401 self._hit.hit_def = self._value 402 self._hit.title += self._value 403 self._descr.title = self._hit.title
404
405 - def _end_Hit_accession(self):
406 """accession of the database sequence 407 """ 408 self._hit.accession = self._value 409 self._descr.accession = self._value
410
411 - def _end_Hit_len(self):
412 self._hit.length = int(self._value)
413 414 # HSPs
415 - def _start_Hsp(self):
416 #Note that self._start_Hit() should have been called 417 #to setup things like self._blast.multiple_alignment 418 self._hit.hsps.append(Record.HSP()) 419 self._hsp = self._hit.hsps[-1] 420 self._descr.num_alignments += 1 421 self._blast.multiple_alignment.append(Record.MultipleAlignment()) 422 self._mult_al = self._blast.multiple_alignment[-1]
423 424 # Hsp_num is useless
425 - def _end_Hsp_score(self):
426 """raw score of HSP 427 """ 428 self._hsp.score = float(self._value) 429 if self._descr.score is None: 430 self._descr.score = float(self._value)
431
432 - def _end_Hsp_bit_score(self):
433 """bit score of HSP 434 """ 435 self._hsp.bits = float(self._value) 436 if self._descr.bits is None: 437 self._descr.bits = float(self._value)
438
439 - def _end_Hsp_evalue(self):
440 """expect value of the HSP 441 """ 442 self._hsp.expect = float(self._value) 443 if self._descr.e is None: 444 self._descr.e = float(self._value)
445
446 - def _end_Hsp_query_from(self):
447 """offset of query at the start of the alignment (one-offset) 448 """ 449 self._hsp.query_start = int(self._value)
450
451 - def _end_Hsp_query_to(self):
452 """offset of query at the end of the alignment (one-offset) 453 """ 454 self._hsp.query_end = int(self._value)
455
456 - def _end_Hsp_hit_from(self):
457 """offset of the database at the start of the alignment (one-offset) 458 """ 459 self._hsp.sbjct_start = int(self._value)
460
461 - def _end_Hsp_hit_to(self):
462 """offset of the database at the end of the alignment (one-offset) 463 """ 464 self._hsp.sbjct_end = int(self._value)
465 466 ## def _end_Hsp_pattern_from(self): 467 ## """start of phi-blast pattern on the query (one-offset) 468 ## """ 469 ## pass # XXX TODO PSI 470 471 ## def _end_Hsp_pattern_to(self): 472 ## """end of phi-blast pattern on the query (one-offset) 473 ## """ 474 ## pass # XXX TODO PSI 475
476 - def _end_Hsp_query_frame(self):
477 """frame of the query if applicable 478 """ 479 self._hsp.frame = (int(self._value),)
480
481 - def _end_Hsp_hit_frame(self):
482 """frame of the database sequence if applicable 483 """ 484 self._hsp.frame += (int(self._value),)
485
486 - def _end_Hsp_identity(self):
487 """number of identities in the alignment 488 """ 489 self._hsp.identities = int(self._value)
490
491 - def _end_Hsp_positive(self):
492 """number of positive (conservative) substitutions in the alignment 493 """ 494 self._hsp.positives = int(self._value)
495
496 - def _end_Hsp_gaps(self):
497 """number of gaps in the alignment 498 """ 499 self._hsp.gaps = int(self._value)
500
501 - def _end_Hsp_align_len(self):
502 """length of the alignment 503 """ 504 self._hsp.align_length = int(self._value)
505 506 ## def _en_Hsp_density(self): 507 ## """score density 508 ## """ 509 ## pass # XXX ??? 510
511 - def _end_Hsp_qseq(self):
512 """alignment string for the query 513 """ 514 self._hsp.query = self._value
515
516 - def _end_Hsp_hseq(self):
517 """alignment string for the database 518 """ 519 self._hsp.sbjct = self._value
520
521 - def _end_Hsp_midline(self):
522 """Formatting middle line as normally seen in BLAST report 523 """ 524 self._hsp.match = self._value # do NOT strip spaces! 525 assert len(self._hsp.match)==len(self._hsp.query) 526 assert len(self._hsp.match)==len(self._hsp.sbjct)
527 528 # Statistics
529 - def _end_Statistics_db_num(self):
530 """number of sequences in the database 531 """ 532 self._blast.num_sequences_in_database = int(self._value)
533
534 - def _end_Statistics_db_len(self):
535 """number of letters in the database 536 """ 537 self._blast.num_letters_in_database = int(self._value)
538
539 - def _end_Statistics_hsp_len(self):
540 """the effective HSP length 541 """ 542 self._blast.effective_hsp_length = int(self._value)
543
545 """the effective search space 546 """ 547 self._blast.effective_search_space = float(self._value)
548
549 - def _end_Statistics_kappa(self):
550 """Karlin-Altschul parameter K 551 """ 552 self._blast.ka_params = float(self._value)
553
554 - def _end_Statistics_lambda(self):
555 """Karlin-Altschul parameter Lambda 556 """ 557 self._blast.ka_params = (float(self._value), 558 self._blast.ka_params)
559
560 - def _end_Statistics_entropy(self):
561 """Karlin-Altschul parameter H 562 """ 563 self._blast.ka_params = self._blast.ka_params + (float(self._value),)
564 565
566 -def read(handle, debug=0):
567 """Returns a single Blast record (assumes just one query). 568 569 This function is for use when there is one and only one BLAST 570 result in your XML file. 571 572 Use the Bio.Blast.NCBIXML.parse() function if you expect more than 573 one BLAST record (i.e. if you have more than one query sequence). 574 575 """ 576 iterator = parse(handle, debug) 577 try: 578 first = iterator.next() 579 except StopIteration: 580 first = None 581 if first is None: 582 raise ValueError("No records found in handle") 583 try: 584 second = iterator.next() 585 except StopIteration: 586 second = None 587 if second is not None: 588 raise ValueError("More than one record found in handle") 589 return first
590 591
592 -def parse(handle, debug=0):
593 """Returns an iterator a Blast record for each query. 594 595 handle - file handle to and XML file to parse 596 debug - integer, amount of debug information to print 597 598 This is a generator function that returns multiple Blast records 599 objects - one for each query sequence given to blast. The file 600 is read incrementally, returning complete records as they are read 601 in. 602 603 Should cope with new BLAST 2.2.14+ which gives a single XML file 604 for multiple query records. 605 606 Should also cope with XML output from older versions BLAST which 607 gave multiple XML files concatenated together (giving a single file 608 which strictly speaking wasn't valid XML).""" 609 from xml.parsers import expat 610 BLOCK = 1024 611 MARGIN = 10 # must be at least length of newline + XML start 612 XML_START = "<?xml" 613 614 text = handle.read(BLOCK) 615 pending = "" 616 617 if not text: 618 #NO DATA FOUND! 619 raise ValueError("Your XML file was empty") 620 621 while text: 622 #We are now starting a new XML file 623 if not text.startswith(XML_START): 624 raise ValueError("Your XML file did not start with %s... " 625 "but instead %s" 626 % (XML_START, repr(text[:20]))) 627 628 expat_parser = expat.ParserCreate() 629 blast_parser = BlastParser(debug) 630 expat_parser.StartElementHandler = blast_parser.startElement 631 expat_parser.EndElementHandler = blast_parser.endElement 632 expat_parser.CharacterDataHandler = blast_parser.characters 633 634 expat_parser.Parse(text, False) 635 while blast_parser._records: 636 record = blast_parser._records[0] 637 blast_parser._records = blast_parser._records[1:] 638 yield record 639 640 while True: 641 #Read in another block of the file... 642 text, pending = pending + handle.read(BLOCK), "" 643 if not text: 644 #End of the file! 645 expat_parser.Parse("", True) # End of XML record 646 break 647 648 #Now read a little bit more so we can check for the 649 #start of another XML file... 650 pending = handle.read(MARGIN) 651 652 if ("\n" + XML_START) not in (text + pending): 653 # Good - still dealing with the same XML file 654 expat_parser.Parse(text, False) 655 while blast_parser._records: 656 yield blast_parser._records.pop(0) 657 else: 658 # This is output from pre 2.2.14 BLAST, 659 # one XML file for each query! 660 661 # Finish the old file: 662 text, pending = (text+pending).split("\n" + XML_START,1) 663 pending = XML_START + pending 664 665 expat_parser.Parse(text, True) # End of XML record 666 while blast_parser._records: 667 yield blast_parser._records.pop(0) 668 669 #Now we are going to re-loop, reset the 670 #parsers and start reading the next XML file 671 text, pending = pending, "" 672 break 673 674 #this was added because it seems that the Jython expat parser 675 #was adding records later then the Python one 676 while blast_parser._records: 677 yield blast_parser._records.pop(0) 678 679 #At this point we have finished the first XML record. 680 #If the file is from an old version of blast, it may 681 #contain more XML records (check if text==""). 682 assert pending=="" 683 assert len(blast_parser._records) == 0 684 685 #We should have finished the file! 686 assert text=="" 687 assert pending=="" 688 assert len(blast_parser._records) == 0
689 690 if __name__ == '__main__': 691 import sys 692 handle = open(sys.argv[1]) 693 r_list = parse(handle) 694 695 for r in r_list: 696 # Small test 697 print 'Blast of', r.query 698 print 'Found %s alignments with a total of %s HSPs' % (len(r.alignments), 699 reduce(lambda a,b: a+b, 700 [len(a.hsps) for a in r.alignments])) 701 702 for al in r.alignments: 703 print al.title[:50], al.length, 'bp', len(al.hsps), 'HSPs' 704 705 # Cookbook example 706 E_VALUE_THRESH = 0.04 707 for alignment in r.alignments: 708 for hsp in alignment.hsps: 709 if hsp.expect < E_VALUE_THRESH: 710 print '*****' 711 print 'sequence', alignment.title 712 print 'length', alignment.length 713 print 'e value', hsp.expect 714 print hsp.query[:75] + '...' 715 print hsp.match[:75] + '...' 716 print hsp.sbjct[:75] + '...' 717