Package Bio :: Package Blast :: Module NCBIXML
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIXML

  1  # Copyright 2000 by Bertrand Frottier .  All rights reserved. 
  2  # Revisions 2005-2006 copyright Michiel de Hoon 
  3  # Revisions 2006-2009 copyright Peter Cock 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  """Code to work with the BLAST XML output. 
  8   
  9  The BLAST XML DTD file is on the NCBI FTP site at: 
 10  ftp://ftp.ncbi.nlm.nih.gov/blast/documents/xml/NCBI_BlastOutput.dtd 
 11  """ 
 12  from __future__ import print_function 
 13   
 14  from Bio.Blast import Record 
 15  import xml.sax 
 16  from xml.sax.handler import ContentHandler 
 17   
 18   
19 -class _XMLparser(ContentHandler):
20 """Generic SAX Parser (PRIVATE). 21 22 Just a very basic SAX parser. 23 24 Redefine the methods startElement, characters and endElement. 25 """ 26
27 - def __init__(self, debug=0):
28 """Constructor. 29 30 Arguments: 31 - debug - integer, amount of debug information to print 32 33 """ 34 self._tag = [] 35 self._value = '' 36 self._debug = debug 37 self._debug_ignore_list = []
38
39 - def _secure_name(self, name):
40 """Removes 'dangerous' from tag names. 41 42 Arguments: 43 - name -- name to be 'secured' 44 45 """ 46 # Replace '-' with '_' in XML tag names 47 return name.replace('-', '_')
48
49 - def startElement(self, name, attr):
50 """Found XML start tag. 51 52 No real need of attr, BLAST DTD doesn't use them 53 54 Arguments: 55 - name -- name of the tag 56 - attr -- tag attributes 57 58 """ 59 self._tag.append(name) 60 61 # Try to call a method (defined in subclasses) 62 method = self._secure_name('_start_' + name) 63 64 # Note could use try / except AttributeError 65 # BUT I found often triggered by nested errors... 66 if hasattr(self, method): 67 getattr(self, method)() 68 if self._debug > 4: 69 print("NCBIXML: Parsed: " + method) 70 elif self._debug > 3: 71 # Doesn't exist (yet) and may want to warn about it 72 if method not in self._debug_ignore_list: 73 print("NCBIXML: Ignored: " + method) 74 self._debug_ignore_list.append(method) 75 76 # We don't care about white space in parent tags like Hsp, 77 # but that white space doesn't belong to child tags like Hsp_midline 78 if self._value.strip(): 79 raise ValueError("What should we do with %s before the %s tag?" 80 % (repr(self._value), name)) 81 self._value = ""
82
83 - def characters(self, ch):
84 """Found some text. 85 86 Arguments: 87 - ch -- characters read 88 89 """ 90 self._value += ch # You don't ever get the whole string
91
92 - def endElement(self, name):
93 """Found XML end tag. 94 95 Arguments: 96 - name -- tag name 97 98 """ 99 # DON'T strip any white space, we may need it e.g. the hsp-midline 100 101 # Try to call a method (defined in subclasses) 102 method = self._secure_name('_end_' + name) 103 # Note could use try / except AttributeError 104 # BUT I found often triggered by nested errors... 105 if hasattr(self, method): 106 getattr(self, method)() 107 if self._debug > 2: 108 print("NCBIXML: Parsed: %s %s" % (method, self._value)) 109 elif self._debug > 1: 110 # Doesn't exist (yet) and may want to warn about it 111 if method not in self._debug_ignore_list: 112 print("NCBIXML: Ignored: %s %s" % (method, self._value)) 113 self._debug_ignore_list.append(method) 114 115 # Reset character buffer 116 self._value = ''
117 118
119 -class BlastParser(_XMLparser):
120 """Parse XML BLAST data into a Record.Blast object. 121 122 Parses XML output from BLAST (direct use discouraged). 123 This (now) returns a list of Blast records. 124 Historically it returned a single Blast record. 125 You are expected to use this via the parse or read functions. 126 127 All XML 'action' methods are private methods and may be: 128 129 - ``_start_TAG`` called when the start tag is found 130 - ``_end_TAG`` called when the end tag is found 131 132 """ 133
134 - def __init__(self, debug=0):
135 """Constructor. 136 137 Arguments: 138 - debug - integer, amount of debug information to print 139 140 """ 141 # Calling superclass method 142 _XMLparser.__init__(self, debug) 143 144 self._parser = xml.sax.make_parser() 145 self._parser.setContentHandler(self) 146 147 # To avoid ValueError: unknown url type: NCBI_BlastOutput.dtd 148 self._parser.setFeature(xml.sax.handler.feature_validation, 0) 149 self._parser.setFeature(xml.sax.handler.feature_namespaces, 0) 150 self._parser.setFeature(xml.sax.handler.feature_external_pes, 0) 151 self._parser.setFeature(xml.sax.handler.feature_external_ges, 0) 152 153 self.reset()
154
155 - def reset(self):
156 """Reset all the data allowing reuse of the BlastParser() object.""" 157 self._records = [] 158 self._header = Record.Header() 159 self._parameters = Record.Parameters() 160 self._parameters.filter = None # Maybe I should update the class?
161
162 - def _start_Iteration(self):
163 self._blast = Record.Blast() 164 pass
165
166 - def _end_Iteration(self):
167 # We stored a lot of generic "top level" information 168 # in self._header (an object of type Record.Header) 169 self._blast.reference = self._header.reference 170 self._blast.date = self._header.date 171 self._blast.version = self._header.version 172 self._blast.database = self._header.database 173 self._blast.application = self._header.application 174 175 # These are required for "old" pre 2.2.14 files 176 # where only <BlastOutput_query-ID>, <BlastOutput_query-def> 177 # and <BlastOutput_query-len> were used. Now they 178 # are supplemented/replaced by <Iteration_query-ID>, 179 # <Iteration_query-def> and <Iteration_query-len> 180 if not hasattr(self._blast, "query") \ 181 or not self._blast.query: 182 self._blast.query = self._header.query 183 if not hasattr(self._blast, "query_id") \ 184 or not self._blast.query_id: 185 self._blast.query_id = self._header.query_id 186 if not hasattr(self._blast, "query_letters") \ 187 or not self._blast.query_letters: 188 self._blast.query_letters = self._header.query_letters 189 190 # Hack to record the query length as both the query_letters and 191 # query_length properties (as in the plain text parser, see 192 # Bug 2176 comment 12): 193 self._blast.query_length = self._blast.query_letters 194 # Perhaps in the long term we should deprecate one, but I would 195 # prefer to drop query_letters - so we need a transition period 196 # with both. 197 198 # Hack to record the claimed database size as database_length 199 # (as well as in num_letters_in_database, see Bug 2176 comment 13): 200 self._blast.database_length = self._blast.num_letters_in_database 201 # TODO? Deprecate database_letters next? 202 203 # Hack to record the claimed database sequence count as database_sequences 204 self._blast.database_sequences = self._blast.num_sequences_in_database 205 206 # Apply the "top level" parameter information 207 self._blast.matrix = self._parameters.matrix 208 self._blast.num_seqs_better_e = self._parameters.num_seqs_better_e 209 self._blast.gap_penalties = self._parameters.gap_penalties 210 self._blast.filter = self._parameters.filter 211 self._blast.expect = self._parameters.expect 212 self._blast.sc_match = self._parameters.sc_match 213 self._blast.sc_mismatch = self._parameters.sc_mismatch 214 215 # Add to the list 216 self._records.append(self._blast) 217 # Clear the object (a new empty one is create in _start_Iteration) 218 self._blast = None 219 220 if self._debug: 221 print("NCBIXML: Added Blast record to results")
222 223 # Header
224 - def _end_BlastOutput_program(self):
225 """BLAST program, e.g., blastp, blastn, etc. 226 227 Save this to put on each blast record object 228 """ 229 self._header.application = self._value.upper()
230
231 - def _end_BlastOutput_version(self):
232 """Version number and date of the BLAST engine. 233 234 e.g. "BLASTX 2.2.12 [Aug-07-2005]" but there can also be 235 variants like "BLASTP 2.2.18+" without the date. 236 237 Save this to put on each blast record object 238 """ 239 parts = self._value.split() 240 # TODO - Check the first word starts with BLAST? 241 242 # The version is the second word (field one) 243 self._header.version = parts[1] 244 245 # Check there is a third word (the date) 246 if len(parts) >= 3: 247 if parts[2][0] == "[" and parts[2][-1] == "]": 248 self._header.date = parts[2][1:-1] 249 else: 250 # Assume this is still a date, but without the 251 # square brackets 252 self._header.date = parts[2]
253
255 """A reference to the article describing the algorithm (PRIVATE). 256 257 Save this to put on each blast record object 258 """ 259 self._header.reference = self._value
260
261 - def _end_BlastOutput_db(self):
262 """The database(s) searched (PRIVATE). 263 264 Save this to put on each blast record object 265 """ 266 self._header.database = self._value
267
269 """The identifier of the query (PRIVATE). 270 271 Important in old pre 2.2.14 BLAST, for recent versions 272 <Iteration_query-ID> is enough 273 """ 274 self._header.query_id = self._value
275
277 """The definition line of the query (PRIVATE). 278 279 Important in old pre 2.2.14 BLAST, for recent versions 280 <Iteration_query-def> is enough 281 """ 282 self._header.query = self._value
283
285 """The length of the query (PRIVATE). 286 287 Important in old pre 2.2.14 BLAST, for recent versions 288 <Iteration_query-len> is enough 289 """ 290 self._header.query_letters = int(self._value)
291
292 - def _end_Iteration_query_ID(self):
293 """The identifier of the query (PRIVATE).""" 294 self._blast.query_id = self._value
295
296 - def _end_Iteration_query_def(self):
297 """The definition line of the query (PRIVATE).""" 298 self._blast.query = self._value
299
300 - def _end_Iteration_query_len(self):
301 """The length of the query (PRIVATE).""" 302 self._blast.query_letters = int(self._value)
303 304 # def _end_BlastOutput_query_seq(self): 305 # """The query sequence (PRIVATE).""" 306 # pass # XXX Missing in Record.Blast ? 307 308 # def _end_BlastOutput_iter_num(self): 309 # """The psi-blast iteration number (PRIVATE).""" 310 # pass # XXX TODO PSI 311
312 - def _end_BlastOutput_hits(self):
313 """Hits to the database sequences, one for every sequence (PRIVATE).""" 314 self._blast.num_hits = int(self._value)
315 316 # def _end_BlastOutput_message(self): 317 # """error messages (PRIVATE).""" 318 # pass # XXX What to do ? 319 320 # Parameters
321 - def _end_Parameters_matrix(self):
322 """Matrix used (-M on legacy BLAST) (PRIVATE).""" 323 self._parameters.matrix = self._value
324
325 - def _end_Parameters_expect(self):
326 """Expect values cutoff (PRIVATE).""" 327 # NOTE: In old text output there was a line: 328 # Number of sequences better than 1.0e-004: 1 329 # As far as I can see, parameters.num_seqs_better_e 330 # would take the value of 1, and the expectation 331 # value was not recorded. 332 # 333 # Anyway we should NOT record this against num_seqs_better_e 334 self._parameters.expect = self._value
335 336 # def _end_Parameters_include(self): 337 # """Inclusion threshold for a psi-blast iteration (-h) (PRIVATE).""" 338 # pass # XXX TODO PSI 339
340 - def _end_Parameters_sc_match(self):
341 """Match score for nucleotide-nucleotide comparison (-r) (PRIVATE).""" 342 self._parameters.sc_match = int(self._value)
343
345 """Mismatch penalty for nucleotide-nucleotide comparison (-r) (PRIVATE).""" 346 self._parameters.sc_mismatch = int(self._value)
347
348 - def _end_Parameters_gap_open(self):
349 """Gap existence cost (-G) (PRIVATE).""" 350 self._parameters.gap_penalties = int(self._value)
351
353 """Gap extension cose (-E) (PRIVATE).""" 354 self._parameters.gap_penalties = (self._parameters.gap_penalties, 355 int(self._value))
356
357 - def _end_Parameters_filter(self):
358 """Filtering options (-F) (PRIVATE).""" 359 self._parameters.filter = self._value
360 361 # def _end_Parameters_pattern(self): 362 # """Pattern used for phi-blast search 363 # """ 364 # pass # XXX TODO PSI 365 366 # def _end_Parameters_entrez_query(self): 367 # """Entrez query used to limit search 368 # """ 369 # pass # XXX TODO PSI 370 371 # Hits
372 - def _start_Hit(self):
373 self._blast.alignments.append(Record.Alignment()) 374 self._blast.descriptions.append(Record.Description()) 375 self._blast.multiple_alignment = [] 376 self._hit = self._blast.alignments[-1] 377 self._descr = self._blast.descriptions[-1] 378 self._descr.num_alignments = 0
379
380 - def _end_Hit(self):
381 # Cleanup 382 self._blast.multiple_alignment = None 383 self._hit = None 384 self._descr = None
385
386 - def _end_Hit_id(self):
387 """Identifier of the database sequence (PRIVATE).""" 388 self._hit.hit_id = self._value 389 self._hit.title = self._value + ' '
390
391 - def _end_Hit_def(self):
392 """Definition line of the database sequence (PRIVATE).""" 393 self._hit.hit_def = self._value 394 self._hit.title += self._value 395 self._descr.title = self._hit.title
396
397 - def _end_Hit_accession(self):
398 """Accession of the database sequence (PRIVATE).""" 399 self._hit.accession = self._value 400 self._descr.accession = self._value
401
402 - def _end_Hit_len(self):
403 self._hit.length = int(self._value)
404 405 # HSPs
406 - def _start_Hsp(self):
407 # Note that self._start_Hit() should have been called 408 # to setup things like self._blast.multiple_alignment 409 self._hit.hsps.append(Record.HSP()) 410 self._hsp = self._hit.hsps[-1] 411 self._descr.num_alignments += 1 412 self._blast.multiple_alignment.append(Record.MultipleAlignment()) 413 self._mult_al = self._blast.multiple_alignment[-1]
414 415 # Hsp_num is useless
416 - def _end_Hsp_score(self):
417 """Raw score of HSP (PRIVATE).""" 418 self._hsp.score = float(self._value) 419 if self._descr.score is None: 420 self._descr.score = float(self._value)
421
422 - def _end_Hsp_bit_score(self):
423 """Bit score of HSP (PRIVATE).""" 424 self._hsp.bits = float(self._value) 425 if self._descr.bits is None: 426 self._descr.bits = float(self._value)
427
428 - def _end_Hsp_evalue(self):
429 """Expect value of the HSP (PRIVATE).""" 430 self._hsp.expect = float(self._value) 431 if self._descr.e is None: 432 self._descr.e = float(self._value)
433
434 - def _end_Hsp_query_from(self):
435 """Offset of query at the start of the alignment (one-offset) (PRIVATE).""" 436 self._hsp.query_start = int(self._value)
437
438 - def _end_Hsp_query_to(self):
439 """Offset of query at the end of the alignment (one-offset) (PRIVATE).""" 440 self._hsp.query_end = int(self._value)
441
442 - def _end_Hsp_hit_from(self):
443 """Offset of the database at the start of the alignment (one-offset) (PRIVATE).""" 444 self._hsp.sbjct_start = int(self._value)
445
446 - def _end_Hsp_hit_to(self):
447 """Offset of the database at the end of the alignment (one-offset) (PRIVATE).""" 448 self._hsp.sbjct_end = int(self._value)
449 450 # def _end_Hsp_pattern_from(self): 451 # """Start of phi-blast pattern on the query (one-offset) (PRIVATE).""" 452 # pass # XXX TODO PSI 453 454 # def _end_Hsp_pattern_to(self): 455 # """End of phi-blast pattern on the query (one-offset) (PRIVATE).""" 456 # pass # XXX TODO PSI 457
458 - def _end_Hsp_query_frame(self):
459 """Frame of the query if applicable (PRIVATE).""" 460 self._hsp.frame = (int(self._value),)
461
462 - def _end_Hsp_hit_frame(self):
463 """Frame of the database sequence if applicable (PRIVATE).""" 464 self._hsp.frame += (int(self._value),)
465
466 - def _end_Hsp_identity(self):
467 """Number of identities in the alignment (PRIVATE).""" 468 self._hsp.identities = int(self._value)
469
470 - def _end_Hsp_positive(self):
471 """Number of positive (conservative) substitutions in the alignment (PRIVATE).""" 472 self._hsp.positives = int(self._value)
473
474 - def _end_Hsp_gaps(self):
475 """Number of gaps in the alignment (PRIVATE).""" 476 self._hsp.gaps = int(self._value)
477
478 - def _end_Hsp_align_len(self):
479 """Length of the alignment (PRIVATE).""" 480 self._hsp.align_length = int(self._value)
481 482 # def _en_Hsp_density(self): 483 # """Score density (PRIVATE).""" 484 # pass # XXX ??? 485
486 - def _end_Hsp_qseq(self):
487 """Alignment string for the query (PRIVATE).""" 488 self._hsp.query = self._value
489
490 - def _end_Hsp_hseq(self):
491 """Alignment string for the database (PRIVATE).""" 492 self._hsp.sbjct = self._value
493
494 - def _end_Hsp_midline(self):
495 """Formatting middle line as normally seen in BLAST report (PRIVATE).""" 496 self._hsp.match = self._value # do NOT strip spaces! 497 assert len(self._hsp.match) == len(self._hsp.query) 498 assert len(self._hsp.match) == len(self._hsp.sbjct)
499 500 # Statistics
501 - def _end_Statistics_db_num(self):
502 """Number of sequences in the database (PRIVATE).""" 503 self._blast.num_sequences_in_database = int(self._value)
504
505 - def _end_Statistics_db_len(self):
506 """Number of letters in the database (PRIVATE).""" 507 self._blast.num_letters_in_database = int(self._value)
508
509 - def _end_Statistics_hsp_len(self):
510 """The effective HSP length (PRIVATE).""" 511 self._blast.effective_hsp_length = int(self._value)
512
514 """The effective search space (PRIVATE).""" 515 self._blast.effective_search_space = float(self._value)
516
517 - def _end_Statistics_kappa(self):
518 """Karlin-Altschul parameter K (PRIVATE).""" 519 self._blast.ka_params = float(self._value)
520
521 - def _end_Statistics_lambda(self):
522 """Karlin-Altschul parameter Lambda (PRIVATE).""" 523 self._blast.ka_params = (float(self._value), self._blast.ka_params)
524
525 - def _end_Statistics_entropy(self):
526 """Karlin-Altschul parameter H (PRIVATE).""" 527 self._blast.ka_params = self._blast.ka_params + (float(self._value),)
528 529
530 -def read(handle, debug=0):
531 """Returns a single Blast record (assumes just one query). 532 533 Uses the BlastParser internally. 534 535 This function is for use when there is one and only one BLAST 536 result in your XML file. 537 538 Use the Bio.Blast.NCBIXML.parse() function if you expect more than 539 one BLAST record (i.e. if you have more than one query sequence). 540 """ 541 iterator = parse(handle, debug) 542 try: 543 first = next(iterator) 544 except StopIteration: 545 first = None 546 if first is None: 547 raise ValueError("No records found in handle") 548 try: 549 second = next(iterator) 550 except StopIteration: 551 second = None 552 if second is not None: 553 raise ValueError("More than one record found in handle") 554 return first
555 556
557 -def parse(handle, debug=0):
558 """Returns an iterator a Blast record for each query. 559 560 Incremental parser, this is an iterator that returns 561 Blast records. It uses the BlastParser internally. 562 563 handle - file handle to and XML file to parse 564 debug - integer, amount of debug information to print 565 566 This is a generator function that returns multiple Blast records 567 objects - one for each query sequence given to blast. The file 568 is read incrementally, returning complete records as they are read 569 in. 570 571 Should cope with new BLAST 2.2.14+ which gives a single XML file 572 for multiple query records. 573 574 Should also cope with XML output from older versions BLAST which 575 gave multiple XML files concatenated together (giving a single file 576 which strictly speaking wasn't valid XML). 577 """ 578 from xml.parsers import expat 579 BLOCK = 1024 580 MARGIN = 10 # must be at least length of newline + XML start 581 XML_START = "<?xml" 582 583 text = handle.read(BLOCK) 584 pending = "" 585 586 if not text: 587 # NO DATA FOUND! 588 raise ValueError("Your XML file was empty") 589 590 while text: 591 # We are now starting a new XML file 592 if not text.startswith(XML_START): 593 raise ValueError("Your XML file did not start with %s... " 594 "but instead %s" 595 % (XML_START, repr(text[:20]))) 596 597 expat_parser = expat.ParserCreate() 598 blast_parser = BlastParser(debug) 599 expat_parser.StartElementHandler = blast_parser.startElement 600 expat_parser.EndElementHandler = blast_parser.endElement 601 expat_parser.CharacterDataHandler = blast_parser.characters 602 603 expat_parser.Parse(text, False) 604 while blast_parser._records: 605 record = blast_parser._records[0] 606 blast_parser._records = blast_parser._records[1:] 607 yield record 608 609 while True: 610 # Read in another block of the file... 611 text, pending = pending + handle.read(BLOCK), "" 612 if not text: 613 # End of the file! 614 expat_parser.Parse("", True) # End of XML record 615 break 616 617 # Now read a little bit more so we can check for the 618 # start of another XML file... 619 pending = handle.read(MARGIN) 620 621 if ("\n" + XML_START) not in (text + pending): 622 # Good - still dealing with the same XML file 623 expat_parser.Parse(text, False) 624 while blast_parser._records: 625 yield blast_parser._records.pop(0) 626 else: 627 # This is output from pre 2.2.14 BLAST, 628 # one XML file for each query! 629 630 # Finish the old file: 631 text, pending = (text + pending).split("\n" + XML_START, 1) 632 pending = XML_START + pending 633 634 expat_parser.Parse(text, True) # End of XML record 635 while blast_parser._records: 636 yield blast_parser._records.pop(0) 637 638 # Now we are going to re-loop, reset the 639 # parsers and start reading the next XML file 640 text, pending = pending, "" 641 break 642 643 # this was added because it seems that the Jython expat parser 644 # was adding records later then the Python one 645 while blast_parser._records: 646 yield blast_parser._records.pop(0) 647 648 # At this point we have finished the first XML record. 649 # If the file is from an old version of blast, it may 650 # contain more XML records (check if text==""). 651 assert pending == "" 652 assert len(blast_parser._records) == 0 653 654 # We should have finished the file! 655 assert text == "" 656 assert pending == "" 657 assert len(blast_parser._records) == 0
658