Package Bio :: Package Blast :: Module NCBIXML
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIXML

  1  # Copyright 2000 by Bertrand Frottier .  All rights reserved. 
  2  # Revisions 2005-2006 copyright Michiel de Hoon 
  3  # Revisions 2006-2009 copyright Peter Cock 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  """Code to work with the BLAST XML output. 
  8   
  9  The BLAST XML DTD file is on the NCBI FTP site at: 
 10  ftp://ftp.ncbi.nlm.nih.gov/blast/documents/xml/NCBI_BlastOutput.dtd 
 11  """ 
 12  from __future__ import print_function 
 13   
 14  from Bio.Blast import Record 
 15  import xml.sax 
 16  from xml.sax.handler import ContentHandler 
 17  from functools import reduce 
 18   
 19  __docformat__ = "restructuredtext en" 
 20   
 21   
22 -class _XMLparser(ContentHandler):
23 """Generic SAX Parser (PRIVATE). 24 25 Just a very basic SAX parser. 26 27 Redefine the methods startElement, characters and endElement. 28 """
29 - def __init__(self, debug=0):
30 """Constructor. 31 32 debug - integer, amount of debug information to print 33 """ 34 self._tag = [] 35 self._value = '' 36 self._debug = debug 37 self._debug_ignore_list = []
38
39 - def _secure_name(self, name):
40 """Removes 'dangerous' from tag names. 41 42 name -- name to be 'secured' 43 """ 44 # Replace '-' with '_' in XML tag names 45 return name.replace('-', '_')
46
47 - def startElement(self, name, attr):
48 """Found XML start tag. 49 50 No real need of attr, BLAST DTD doesn't use them 51 52 name -- name of the tag 53 54 attr -- tag attributes 55 """ 56 self._tag.append(name) 57 58 # Try to call a method (defined in subclasses) 59 method = self._secure_name('_start_' + name) 60 61 # Note could use try / except AttributeError 62 # BUT I found often triggered by nested errors... 63 if hasattr(self, method): 64 eval("self.%s()" % method) 65 if self._debug > 4: 66 print("NCBIXML: Parsed: " + method) 67 elif self._debug > 3: 68 # Doesn't exist (yet) and may want to warn about it 69 if method not in self._debug_ignore_list: 70 print("NCBIXML: Ignored: " + method) 71 self._debug_ignore_list.append(method) 72 73 # We don't care about white space in parent tags like Hsp, 74 # but that white space doesn't belong to child tags like Hsp_midline 75 if self._value.strip(): 76 raise ValueError("What should we do with %s before the %s tag?" 77 % (repr(self._value), name)) 78 self._value = ""
79
80 - def characters(self, ch):
81 """Found some text. 82 83 ch -- characters read 84 """ 85 self._value += ch # You don't ever get the whole string
86
87 - def endElement(self, name):
88 """Found XML end tag. 89 90 name -- tag name 91 """ 92 # DON'T strip any white space, we may need it e.g. the hsp-midline 93 94 # Try to call a method (defined in subclasses) 95 method = self._secure_name('_end_' + name) 96 # Note could use try / except AttributeError 97 # BUT I found often triggered by nested errors... 98 if hasattr(self, method): 99 eval("self.%s()" % method) 100 if self._debug > 2: 101 print("NCBIXML: Parsed: %s %s" % (method, self._value)) 102 elif self._debug > 1: 103 # Doesn't exist (yet) and may want to warn about it 104 if method not in self._debug_ignore_list: 105 print("NCBIXML: Ignored: %s %s" % (method, self._value)) 106 self._debug_ignore_list.append(method) 107 108 # Reset character buffer 109 self._value = ''
110 111
112 -class BlastParser(_XMLparser):
113 """Parse XML BLAST data into a Record.Blast object. 114 115 Parses XML output from BLAST (direct use discouraged). 116 This (now) returns a list of Blast records. 117 Historically it returned a single Blast record. 118 You are expected to use this via the parse or read functions. 119 120 All XML 'action' methods are private methods and may be: 121 - ``_start_TAG`` called when the start tag is found 122 - ``_end_TAG`` called when the end tag is found 123 """ 124
125 - def __init__(self, debug=0):
126 """Constructor. 127 128 debug - integer, amount of debug information to print 129 """ 130 # Calling superclass method 131 _XMLparser.__init__(self, debug) 132 133 self._parser = xml.sax.make_parser() 134 self._parser.setContentHandler(self) 135 136 # To avoid ValueError: unknown url type: NCBI_BlastOutput.dtd 137 self._parser.setFeature(xml.sax.handler.feature_validation, 0) 138 self._parser.setFeature(xml.sax.handler.feature_namespaces, 0) 139 self._parser.setFeature(xml.sax.handler.feature_external_pes, 0) 140 self._parser.setFeature(xml.sax.handler.feature_external_ges, 0) 141 142 self.reset()
143
144 - def reset(self):
145 """Reset all the data allowing reuse of the BlastParser() object.""" 146 self._records = [] 147 self._header = Record.Header() 148 self._parameters = Record.Parameters() 149 self._parameters.filter = None # Maybe I should update the class?
150
151 - def _start_Iteration(self):
152 self._blast = Record.Blast() 153 pass
154
155 - def _end_Iteration(self):
156 # We stored a lot of generic "top level" information 157 # in self._header (an object of type Record.Header) 158 self._blast.reference = self._header.reference 159 self._blast.date = self._header.date 160 self._blast.version = self._header.version 161 self._blast.database = self._header.database 162 self._blast.application = self._header.application 163 164 # These are required for "old" pre 2.2.14 files 165 # where only <BlastOutput_query-ID>, <BlastOutput_query-def> 166 # and <BlastOutput_query-len> were used. Now they 167 # are suplemented/replaced by <Iteration_query-ID>, 168 # <Iteration_query-def> and <Iteration_query-len> 169 if not hasattr(self._blast, "query") \ 170 or not self._blast.query: 171 self._blast.query = self._header.query 172 if not hasattr(self._blast, "query_id") \ 173 or not self._blast.query_id: 174 self._blast.query_id = self._header.query_id 175 if not hasattr(self._blast, "query_letters") \ 176 or not self._blast.query_letters: 177 self._blast.query_letters = self._header.query_letters 178 179 # Hack to record the query length as both the query_letters and 180 # query_length properties (as in the plain text parser, see 181 # Bug 2176 comment 12): 182 self._blast.query_length = self._blast.query_letters 183 # Perhaps in the long term we should deprecate one, but I would 184 # prefer to drop query_letters - so we need a transition period 185 # with both. 186 187 # Hack to record the claimed database size as database_length 188 # (as well as in num_letters_in_database, see Bug 2176 comment 13): 189 self._blast.database_length = self._blast.num_letters_in_database 190 # TODO? Deprecate database_letters next? 191 192 # Hack to record the claimed database sequence count as database_sequences 193 self._blast.database_sequences = self._blast.num_sequences_in_database 194 195 # Apply the "top level" parameter information 196 self._blast.matrix = self._parameters.matrix 197 self._blast.num_seqs_better_e = self._parameters.num_seqs_better_e 198 self._blast.gap_penalties = self._parameters.gap_penalties 199 self._blast.filter = self._parameters.filter 200 self._blast.expect = self._parameters.expect 201 self._blast.sc_match = self._parameters.sc_match 202 self._blast.sc_mismatch = self._parameters.sc_mismatch 203 204 # Add to the list 205 self._records.append(self._blast) 206 # Clear the object (a new empty one is create in _start_Iteration) 207 self._blast = None 208 209 if self._debug: 210 print("NCBIXML: Added Blast record to results")
211 212 # Header
213 - def _end_BlastOutput_program(self):
214 """BLAST program, e.g., blastp, blastn, etc. 215 216 Save this to put on each blast record object 217 """ 218 self._header.application = self._value.upper()
219
220 - def _end_BlastOutput_version(self):
221 """version number and date of the BLAST engine. 222 223 e.g. "BLASTX 2.2.12 [Aug-07-2005]" but there can also be 224 variants like "BLASTP 2.2.18+" without the date. 225 226 Save this to put on each blast record object 227 """ 228 parts = self._value.split() 229 # TODO - Check the first word starts with BLAST? 230 231 # The version is the second word (field one) 232 self._header.version = parts[1] 233 234 # Check there is a third word (the date) 235 if len(parts) >= 3: 236 if parts[2][0] == "[" and parts[2][-1] == "]": 237 self._header.date = parts[2][1:-1] 238 else: 239 # Assume this is still a date, but without the 240 # square brackets 241 self._header.date = parts[2]
242
244 """a reference to the article describing the algorithm (PRIVATE). 245 246 Save this to put on each blast record object 247 """ 248 self._header.reference = self._value
249
250 - def _end_BlastOutput_db(self):
251 """the database(s) searched (PRIVATE). 252 253 Save this to put on each blast record object 254 """ 255 self._header.database = self._value
256
258 """the identifier of the query (PRIVATE). 259 260 Important in old pre 2.2.14 BLAST, for recent versions 261 <Iteration_query-ID> is enough 262 """ 263 self._header.query_id = self._value
264
266 """the definition line of the query (PRIVATE). 267 268 Important in old pre 2.2.14 BLAST, for recent versions 269 <Iteration_query-def> is enough 270 """ 271 self._header.query = self._value
272
274 """the length of the query (PRIVATE). 275 276 Important in old pre 2.2.14 BLAST, for recent versions 277 <Iteration_query-len> is enough 278 """ 279 self._header.query_letters = int(self._value)
280
281 - def _end_Iteration_query_ID(self):
282 """the identifier of the query (PRIVATE).""" 283 self._blast.query_id = self._value
284
285 - def _end_Iteration_query_def(self):
286 """the definition line of the query (PRIVATE).""" 287 self._blast.query = self._value
288
289 - def _end_Iteration_query_len(self):
290 """the length of the query (PRIVATE).""" 291 self._blast.query_letters = int(self._value)
292 293 # def _end_BlastOutput_query_seq(self): 294 # """the query sequence (PRIVATE).""" 295 # pass # XXX Missing in Record.Blast ? 296 297 # def _end_BlastOutput_iter_num(self): 298 # """the psi-blast iteration number (PRIVATE).""" 299 # pass # XXX TODO PSI 300
301 - def _end_BlastOutput_hits(self):
302 """hits to the database sequences, one for every sequence (PRIVATE).""" 303 self._blast.num_hits = int(self._value)
304 305 # def _end_BlastOutput_message(self): 306 # """error messages (PRIVATE).""" 307 # pass # XXX What to do ? 308 309 # Parameters
310 - def _end_Parameters_matrix(self):
311 """matrix used (-M on legacy BLAST) (PRIVATE).""" 312 self._parameters.matrix = self._value
313
314 - def _end_Parameters_expect(self):
315 """expect values cutoff (PRIVATE).""" 316 # NOTE: In old text output there was a line: 317 # Number of sequences better than 1.0e-004: 1 318 # As far as I can see, parameters.num_seqs_better_e 319 # would take the value of 1, and the expectation 320 # value was not recorded. 321 # 322 # Anyway we should NOT record this against num_seqs_better_e 323 self._parameters.expect = self._value
324 325 # def _end_Parameters_include(self): 326 # """inclusion threshold for a psi-blast iteration (-h) (PRIVATE).""" 327 # pass # XXX TODO PSI 328
329 - def _end_Parameters_sc_match(self):
330 """match score for nucleotide-nucleotide comparaison (-r) (PRIVATE).""" 331 self._parameters.sc_match = int(self._value)
332
334 """mismatch penalty for nucleotide-nucleotide comparaison (-r) (PRIVATE).""" 335 self._parameters.sc_mismatch = int(self._value)
336
337 - def _end_Parameters_gap_open(self):
338 """gap existence cost (-G) (PRIVATE).""" 339 self._parameters.gap_penalties = int(self._value)
340
342 """gap extension cose (-E) (PRIVATE).""" 343 self._parameters.gap_penalties = (self._parameters.gap_penalties, 344 int(self._value))
345
346 - def _end_Parameters_filter(self):
347 """filtering options (-F) (PRIVATE).""" 348 self._parameters.filter = self._value
349 350 # def _end_Parameters_pattern(self): 351 # """pattern used for phi-blast search 352 # """ 353 # pass # XXX TODO PSI 354 355 # def _end_Parameters_entrez_query(self): 356 # """entrez query used to limit search 357 # """ 358 # pass # XXX TODO PSI 359 360 # Hits
361 - def _start_Hit(self):
362 self._blast.alignments.append(Record.Alignment()) 363 self._blast.descriptions.append(Record.Description()) 364 self._blast.multiple_alignment = [] 365 self._hit = self._blast.alignments[-1] 366 self._descr = self._blast.descriptions[-1] 367 self._descr.num_alignments = 0
368
369 - def _end_Hit(self):
370 # Cleanup 371 self._blast.multiple_alignment = None 372 self._hit = None 373 self._descr = None
374
375 - def _end_Hit_id(self):
376 """identifier of the database sequence (PRIVATE).""" 377 self._hit.hit_id = self._value 378 self._hit.title = self._value + ' '
379
380 - def _end_Hit_def(self):
381 """definition line of the database sequence (PRIVATE).""" 382 self._hit.hit_def = self._value 383 self._hit.title += self._value 384 self._descr.title = self._hit.title
385
386 - def _end_Hit_accession(self):
387 """accession of the database sequence (PRIVATE).""" 388 self._hit.accession = self._value 389 self._descr.accession = self._value
390
391 - def _end_Hit_len(self):
392 self._hit.length = int(self._value)
393 394 # HSPs
395 - def _start_Hsp(self):
396 # Note that self._start_Hit() should have been called 397 # to setup things like self._blast.multiple_alignment 398 self._hit.hsps.append(Record.HSP()) 399 self._hsp = self._hit.hsps[-1] 400 self._descr.num_alignments += 1 401 self._blast.multiple_alignment.append(Record.MultipleAlignment()) 402 self._mult_al = self._blast.multiple_alignment[-1]
403 404 # Hsp_num is useless
405 - def _end_Hsp_score(self):
406 """raw score of HSP (PRIVATE).""" 407 self._hsp.score = float(self._value) 408 if self._descr.score is None: 409 self._descr.score = float(self._value)
410
411 - def _end_Hsp_bit_score(self):
412 """bit score of HSP (PRIVATE).""" 413 self._hsp.bits = float(self._value) 414 if self._descr.bits is None: 415 self._descr.bits = float(self._value)
416
417 - def _end_Hsp_evalue(self):
418 """expect value of the HSP (PRIVATE).""" 419 self._hsp.expect = float(self._value) 420 if self._descr.e is None: 421 self._descr.e = float(self._value)
422
423 - def _end_Hsp_query_from(self):
424 """offset of query at the start of the alignment (one-offset) (PRIVATE).""" 425 self._hsp.query_start = int(self._value)
426
427 - def _end_Hsp_query_to(self):
428 """offset of query at the end of the alignment (one-offset) (PRIVATE).""" 429 self._hsp.query_end = int(self._value)
430
431 - def _end_Hsp_hit_from(self):
432 """offset of the database at the start of the alignment (one-offset) (PRIVATE).""" 433 self._hsp.sbjct_start = int(self._value)
434
435 - def _end_Hsp_hit_to(self):
436 """offset of the database at the end of the alignment (one-offset) (PRIVATE).""" 437 self._hsp.sbjct_end = int(self._value)
438 439 # def _end_Hsp_pattern_from(self): 440 # """start of phi-blast pattern on the query (one-offset) (PRIVATE).""" 441 # pass # XXX TODO PSI 442 443 # def _end_Hsp_pattern_to(self): 444 # """end of phi-blast pattern on the query (one-offset) (PRIVATE).""" 445 # pass # XXX TODO PSI 446
447 - def _end_Hsp_query_frame(self):
448 """frame of the query if applicable (PRIVATE).""" 449 self._hsp.frame = (int(self._value),)
450
451 - def _end_Hsp_hit_frame(self):
452 """frame of the database sequence if applicable (PRIVATE).""" 453 self._hsp.frame += (int(self._value),)
454
455 - def _end_Hsp_identity(self):
456 """number of identities in the alignment (PRIVATE).""" 457 self._hsp.identities = int(self._value)
458
459 - def _end_Hsp_positive(self):
460 """number of positive (conservative) substitutions in the alignment (PRIVATE).""" 461 self._hsp.positives = int(self._value)
462
463 - def _end_Hsp_gaps(self):
464 """number of gaps in the alignment (PRIVATE).""" 465 self._hsp.gaps = int(self._value)
466
467 - def _end_Hsp_align_len(self):
468 """length of the alignment (PRIVATE).""" 469 self._hsp.align_length = int(self._value)
470 471 # def _en_Hsp_density(self): 472 # """score density (PRIVATE).""" 473 # pass # XXX ??? 474
475 - def _end_Hsp_qseq(self):
476 """alignment string for the query (PRIVATE).""" 477 self._hsp.query = self._value
478
479 - def _end_Hsp_hseq(self):
480 """alignment string for the database (PRIVATE).""" 481 self._hsp.sbjct = self._value
482
483 - def _end_Hsp_midline(self):
484 """Formatting middle line as normally seen in BLAST report (PRIVATE).""" 485 self._hsp.match = self._value # do NOT strip spaces! 486 assert len(self._hsp.match) == len(self._hsp.query) 487 assert len(self._hsp.match) == len(self._hsp.sbjct)
488 489 # Statistics
490 - def _end_Statistics_db_num(self):
491 """number of sequences in the database (PRIVATE).""" 492 self._blast.num_sequences_in_database = int(self._value)
493
494 - def _end_Statistics_db_len(self):
495 """number of letters in the database (PRIVATE).""" 496 self._blast.num_letters_in_database = int(self._value)
497
498 - def _end_Statistics_hsp_len(self):
499 """the effective HSP length (PRIVATE).""" 500 self._blast.effective_hsp_length = int(self._value)
501
503 """the effective search space (PRIVATE).""" 504 self._blast.effective_search_space = float(self._value)
505
506 - def _end_Statistics_kappa(self):
507 """Karlin-Altschul parameter K (PRIVATE).""" 508 self._blast.ka_params = float(self._value)
509
510 - def _end_Statistics_lambda(self):
511 """Karlin-Altschul parameter Lambda (PRIVATE).""" 512 self._blast.ka_params = (float(self._value), self._blast.ka_params)
513
514 - def _end_Statistics_entropy(self):
515 """Karlin-Altschul parameter H (PRIVATE).""" 516 self._blast.ka_params = self._blast.ka_params + (float(self._value),)
517 518
519 -def read(handle, debug=0):
520 """Returns a single Blast record (assumes just one query). 521 522 Uses the BlastParser internally. 523 524 This function is for use when there is one and only one BLAST 525 result in your XML file. 526 527 Use the Bio.Blast.NCBIXML.parse() function if you expect more than 528 one BLAST record (i.e. if you have more than one query sequence). 529 """ 530 iterator = parse(handle, debug) 531 try: 532 first = next(iterator) 533 except StopIteration: 534 first = None 535 if first is None: 536 raise ValueError("No records found in handle") 537 try: 538 second = next(iterator) 539 except StopIteration: 540 second = None 541 if second is not None: 542 raise ValueError("More than one record found in handle") 543 return first
544 545
546 -def parse(handle, debug=0):
547 """Returns an iterator a Blast record for each query. 548 549 Incremental parser, this is an iterator that returns 550 Blast records. It uses the BlastParser internally. 551 552 handle - file handle to and XML file to parse 553 debug - integer, amount of debug information to print 554 555 This is a generator function that returns multiple Blast records 556 objects - one for each query sequence given to blast. The file 557 is read incrementally, returning complete records as they are read 558 in. 559 560 Should cope with new BLAST 2.2.14+ which gives a single XML file 561 for multiple query records. 562 563 Should also cope with XML output from older versions BLAST which 564 gave multiple XML files concatenated together (giving a single file 565 which strictly speaking wasn't valid XML). 566 """ 567 from xml.parsers import expat 568 BLOCK = 1024 569 MARGIN = 10 # must be at least length of newline + XML start 570 XML_START = "<?xml" 571 572 text = handle.read(BLOCK) 573 pending = "" 574 575 if not text: 576 # NO DATA FOUND! 577 raise ValueError("Your XML file was empty") 578 579 while text: 580 # We are now starting a new XML file 581 if not text.startswith(XML_START): 582 raise ValueError("Your XML file did not start with %s... " 583 "but instead %s" 584 % (XML_START, repr(text[:20]))) 585 586 expat_parser = expat.ParserCreate() 587 blast_parser = BlastParser(debug) 588 expat_parser.StartElementHandler = blast_parser.startElement 589 expat_parser.EndElementHandler = blast_parser.endElement 590 expat_parser.CharacterDataHandler = blast_parser.characters 591 592 expat_parser.Parse(text, False) 593 while blast_parser._records: 594 record = blast_parser._records[0] 595 blast_parser._records = blast_parser._records[1:] 596 yield record 597 598 while True: 599 # Read in another block of the file... 600 text, pending = pending + handle.read(BLOCK), "" 601 if not text: 602 # End of the file! 603 expat_parser.Parse("", True) # End of XML record 604 break 605 606 # Now read a little bit more so we can check for the 607 # start of another XML file... 608 pending = handle.read(MARGIN) 609 610 if ("\n" + XML_START) not in (text + pending): 611 # Good - still dealing with the same XML file 612 expat_parser.Parse(text, False) 613 while blast_parser._records: 614 yield blast_parser._records.pop(0) 615 else: 616 # This is output from pre 2.2.14 BLAST, 617 # one XML file for each query! 618 619 # Finish the old file: 620 text, pending = (text + pending).split("\n" + XML_START, 1) 621 pending = XML_START + pending 622 623 expat_parser.Parse(text, True) # End of XML record 624 while blast_parser._records: 625 yield blast_parser._records.pop(0) 626 627 # Now we are going to re-loop, reset the 628 # parsers and start reading the next XML file 629 text, pending = pending, "" 630 break 631 632 # this was added because it seems that the Jython expat parser 633 # was adding records later then the Python one 634 while blast_parser._records: 635 yield blast_parser._records.pop(0) 636 637 # At this point we have finished the first XML record. 638 # If the file is from an old version of blast, it may 639 # contain more XML records (check if text==""). 640 assert pending == "" 641 assert len(blast_parser._records) == 0 642 643 # We should have finished the file! 644 assert text == "" 645 assert pending == "" 646 assert len(blast_parser._records) == 0
647 648 if __name__ == '__main__': 649 import sys 650 with open(sys.argv[1]) as handle: 651 r_list = parse(handle) 652 653 for r in r_list: 654 # Small test 655 print('Blast of %s' % r.query) 656 print('Found %s alignments with a total of %s HSPs' 657 % (len(r.alignments), 658 reduce(lambda a, b: a + b, 659 [len(a.hsps) for a in r.alignments]))) 660 661 for al in r.alignments: 662 print("%s %i bp %i HSPs" % (al.title[:50], al.length, len(al.hsps))) 663 664 # Cookbook example 665 E_VALUE_THRESH = 0.04 666 for alignment in r.alignments: 667 for hsp in alignment.hsps: 668 if hsp.expect < E_VALUE_THRESH: 669 print('*****') 670 print('sequence %s' % alignment.title) 671 print('length %i' % alignment.length) 672 print('e value %f' % hsp.expect) 673 print(hsp.query[:75] + '...') 674 print(hsp.match[:75] + '...') 675 print(hsp.sbjct[:75] + '...') 676