1
2
3
4
5
6
7 """This module provides code to work with the BLAST XML output
8 following the DTD available on the NCBI FTP
9 ftp://ftp.ncbi.nlm.nih.gov/blast/documents/xml/NCBI_BlastOutput.dtd
10
11 Classes:
12 BlastParser Parses XML output from BLAST (direct use discouraged).
13 This (now) returns a list of Blast records.
14 Historically it returned a single Blast record.
15 You are expected to use this via the parse or read functions.
16
17 _XMLParser Generic SAX parser (private).
18
19 Functions:
20 parse Incremental parser, this is an iterator that returns
21 Blast records. It uses the BlastParser internally.
22 read Returns a single Blast record. Uses the BlastParser internally.
23 """
24 from Bio.Blast import Record
25 import xml.sax
26 from xml.sax.handler import ContentHandler
27
28
30 """Generic SAX Parser
31
32 Just a very basic SAX parser.
33
34 Redefine the methods startElement, characters and endElement.
35 """
37 """Constructor
38
39 debug - integer, amount of debug information to print
40 """
41 self._tag = []
42 self._value = ''
43 self._debug = debug
44 self._debug_ignore_list = []
45
47 """Removes 'dangerous' from tag names
48
49 name -- name to be 'secured'
50 """
51
52 return name.replace('-', '_')
53
55 """Found XML start tag
56
57 No real need of attr, BLAST DTD doesn't use them
58
59 name -- name of the tag
60
61 attr -- tag attributes
62 """
63 self._tag.append(name)
64
65
66 method = self._secure_name('_start_' + name)
67
68
69
70 if hasattr(self, method):
71 eval("self.%s()" % method)
72 if self._debug > 4:
73 print "NCBIXML: Parsed: " + method
74 elif self._debug > 3:
75
76 if method not in self._debug_ignore_list:
77 print "NCBIXML: Ignored: " + method
78 self._debug_ignore_list.append(method)
79
80
81
82 if self._value.strip():
83 raise ValueError("What should we do with %s before the %s tag?"
84 % (repr(self._value), name))
85 self._value = ""
86
88 """Found some text
89
90 ch -- characters read
91 """
92 self._value += ch
93
95 """Found XML end tag
96
97 name -- tag name
98 """
99
100
101
102 method = self._secure_name('_end_' + name)
103
104
105 if hasattr(self, method):
106 eval("self.%s()" % method)
107 if self._debug > 2:
108 print "NCBIXML: Parsed: " + method, self._value
109 elif self._debug > 1:
110
111 if method not in self._debug_ignore_list:
112 print "NCBIXML: Ignored: " + method, self._value
113 self._debug_ignore_list.append(method)
114
115
116 self._value = ''
117
118
120 """Parse XML BLAST data into a Record.Blast object
121
122 All XML 'action' methods are private methods and may be:
123 _start_TAG called when the start tag is found
124 _end_TAG called when the end tag is found
125 """
126
128 """Constructor
129
130 debug - integer, amount of debug information to print
131 """
132
133 _XMLparser.__init__(self, debug)
134
135 self._parser = xml.sax.make_parser()
136 self._parser.setContentHandler(self)
137
138
139 self._parser.setFeature(xml.sax.handler.feature_validation, 0)
140 self._parser.setFeature(xml.sax.handler.feature_namespaces, 0)
141 self._parser.setFeature(xml.sax.handler.feature_external_pes, 0)
142 self._parser.setFeature(xml.sax.handler.feature_external_ges, 0)
143
144 self.reset()
145
147 """Reset all the data allowing reuse of the BlastParser() object"""
148 self._records = []
149 self._header = Record.Header()
150 self._parameters = Record.Parameters()
151 self._parameters.filter = None
152
156
158
159
160 self._blast.reference = self._header.reference
161 self._blast.date = self._header.date
162 self._blast.version = self._header.version
163 self._blast.database = self._header.database
164 self._blast.application = self._header.application
165
166
167
168
169
170
171 if not hasattr(self._blast, "query") \
172 or not self._blast.query:
173 self._blast.query = self._header.query
174 if not hasattr(self._blast, "query_id") \
175 or not self._blast.query_id:
176 self._blast.query_id = self._header.query_id
177 if not hasattr(self._blast, "query_letters") \
178 or not self._blast.query_letters:
179 self._blast.query_letters = self._header.query_letters
180
181
182
183
184 self._blast.query_length = self._blast.query_letters
185
186
187
188
189
190
191 self._blast.database_length = self._blast.num_letters_in_database
192
193
194
195 self._blast.database_sequences = self._blast.num_sequences_in_database
196
197
198 self._blast.matrix = self._parameters.matrix
199 self._blast.num_seqs_better_e = self._parameters.num_seqs_better_e
200 self._blast.gap_penalties = self._parameters.gap_penalties
201 self._blast.filter = self._parameters.filter
202 self._blast.expect = self._parameters.expect
203 self._blast.sc_match = self._parameters.sc_match
204 self._blast.sc_mismatch = self._parameters.sc_mismatch
205
206
207 self._records.append(self._blast)
208
209 self._blast = None
210
211 if self._debug:
212 print "NCBIXML: Added Blast record to results"
213
214
216 """BLAST program, e.g., blastp, blastn, etc.
217
218 Save this to put on each blast record object
219 """
220 self._header.application = self._value.upper()
221
223 """version number and date of the BLAST engine.
224
225 e.g. "BLASTX 2.2.12 [Aug-07-2005]" but there can also be
226 variants like "BLASTP 2.2.18+" without the date.
227
228 Save this to put on each blast record object
229 """
230 parts = self._value.split()
231
232
233
234 self._header.version = parts[1]
235
236
237 if len(parts) >= 3:
238 if parts[2][0] == "[" and parts[2][-1] == "]":
239 self._header.date = parts[2][1:-1]
240 else:
241
242
243 self._header.date = parts[2]
244
246 """a reference to the article describing the algorithm
247
248 Save this to put on each blast record object
249 """
250 self._header.reference = self._value
251
253 """the database(s) searched
254
255 Save this to put on each blast record object
256 """
257 self._header.database = self._value
258
260 """the identifier of the query
261
262 Important in old pre 2.2.14 BLAST, for recent versions
263 <Iteration_query-ID> is enough
264 """
265 self._header.query_id = self._value
266
268 """the definition line of the query
269
270 Important in old pre 2.2.14 BLAST, for recent versions
271 <Iteration_query-def> is enough
272 """
273 self._header.query = self._value
274
276 """the length of the query
277
278 Important in old pre 2.2.14 BLAST, for recent versions
279 <Iteration_query-len> is enough
280 """
281 self._header.query_letters = int(self._value)
282
284 """the identifier of the query
285 """
286 self._blast.query_id = self._value
287
289 """the definition line of the query
290 """
291 self._blast.query = self._value
292
294 """the length of the query
295 """
296 self._blast.query_letters = int(self._value)
297
298
299
300
301
302
303
304
305
306
307
309 """hits to the database sequences, one for every sequence
310 """
311 self._blast.num_hits = int(self._value)
312
313
314
315
316
317
318
320 """matrix used (-M)
321 """
322 self._parameters.matrix = self._value
323
325 """expect values cutoff (-e)
326 """
327
328
329
330
331
332
333
334 self._parameters.expect = self._value
335
336
337
338
339
340
342 """match score for nucleotide-nucleotide comparaison (-r)
343 """
344 self._parameters.sc_match = int(self._value)
345
347 """mismatch penalty for nucleotide-nucleotide comparaison (-r)
348 """
349 self._parameters.sc_mismatch = int(self._value)
350
352 """gap existence cost (-G)
353 """
354 self._parameters.gap_penalties = int(self._value)
355
357 """gap extension cose (-E)
358 """
359 self._parameters.gap_penalties = (self._parameters.gap_penalties,
360 int(self._value))
361
363 """filtering options (-F)
364 """
365 self._parameters.filter = self._value
366
367
368
369
370
371
372
373
374
375
376
377
379 self._blast.alignments.append(Record.Alignment())
380 self._blast.descriptions.append(Record.Description())
381 self._blast.multiple_alignment = []
382 self._hit = self._blast.alignments[-1]
383 self._descr = self._blast.descriptions[-1]
384 self._descr.num_alignments = 0
385
387
388 self._blast.multiple_alignment = None
389 self._hit = None
390 self._descr = None
391
393 """identifier of the database sequence
394 """
395 self._hit.hit_id = self._value
396 self._hit.title = self._value + ' '
397
399 """definition line of the database sequence
400 """
401 self._hit.hit_def = self._value
402 self._hit.title += self._value
403 self._descr.title = self._hit.title
404
406 """accession of the database sequence
407 """
408 self._hit.accession = self._value
409 self._descr.accession = self._value
410
412 self._hit.length = int(self._value)
413
414
423
424
426 """raw score of HSP
427 """
428 self._hsp.score = float(self._value)
429 if self._descr.score is None:
430 self._descr.score = float(self._value)
431
433 """bit score of HSP
434 """
435 self._hsp.bits = float(self._value)
436 if self._descr.bits is None:
437 self._descr.bits = float(self._value)
438
440 """expect value of the HSP
441 """
442 self._hsp.expect = float(self._value)
443 if self._descr.e is None:
444 self._descr.e = float(self._value)
445
447 """offset of query at the start of the alignment (one-offset)
448 """
449 self._hsp.query_start = int(self._value)
450
452 """offset of query at the end of the alignment (one-offset)
453 """
454 self._hsp.query_end = int(self._value)
455
457 """offset of the database at the start of the alignment (one-offset)
458 """
459 self._hsp.sbjct_start = int(self._value)
460
462 """offset of the database at the end of the alignment (one-offset)
463 """
464 self._hsp.sbjct_end = int(self._value)
465
466
467
468
469
470
471
472
473
474
475
477 """frame of the query if applicable
478 """
479 self._hsp.frame = (int(self._value),)
480
482 """frame of the database sequence if applicable
483 """
484 self._hsp.frame += (int(self._value),)
485
487 """number of identities in the alignment
488 """
489 self._hsp.identities = int(self._value)
490
492 """number of positive (conservative) substitutions in the alignment
493 """
494 self._hsp.positives = int(self._value)
495
497 """number of gaps in the alignment
498 """
499 self._hsp.gaps = int(self._value)
500
502 """length of the alignment
503 """
504 self._hsp.align_length = int(self._value)
505
506
507
508
509
510
512 """alignment string for the query
513 """
514 self._hsp.query = self._value
515
517 """alignment string for the database
518 """
519 self._hsp.sbjct = self._value
520
522 """Formatting middle line as normally seen in BLAST report
523 """
524 self._hsp.match = self._value
525 assert len(self._hsp.match)==len(self._hsp.query)
526 assert len(self._hsp.match)==len(self._hsp.sbjct)
527
528
533
538
543
548
550 """Karlin-Altschul parameter K
551 """
552 self._blast.ka_params = float(self._value)
553
555 """Karlin-Altschul parameter Lambda
556 """
557 self._blast.ka_params = (float(self._value),
558 self._blast.ka_params)
559
561 """Karlin-Altschul parameter H
562 """
563 self._blast.ka_params = self._blast.ka_params + (float(self._value),)
564
565
566 -def read(handle, debug=0):
567 """Returns a single Blast record (assumes just one query).
568
569 This function is for use when there is one and only one BLAST
570 result in your XML file.
571
572 Use the Bio.Blast.NCBIXML.parse() function if you expect more than
573 one BLAST record (i.e. if you have more than one query sequence).
574
575 """
576 iterator = parse(handle, debug)
577 try:
578 first = iterator.next()
579 except StopIteration:
580 first = None
581 if first is None:
582 raise ValueError("No records found in handle")
583 try:
584 second = iterator.next()
585 except StopIteration:
586 second = None
587 if second is not None:
588 raise ValueError("More than one record found in handle")
589 return first
590
591
592 -def parse(handle, debug=0):
593 """Returns an iterator a Blast record for each query.
594
595 handle - file handle to and XML file to parse
596 debug - integer, amount of debug information to print
597
598 This is a generator function that returns multiple Blast records
599 objects - one for each query sequence given to blast. The file
600 is read incrementally, returning complete records as they are read
601 in.
602
603 Should cope with new BLAST 2.2.14+ which gives a single XML file
604 for multiple query records.
605
606 Should also cope with XML output from older versions BLAST which
607 gave multiple XML files concatenated together (giving a single file
608 which strictly speaking wasn't valid XML)."""
609 from xml.parsers import expat
610 BLOCK = 1024
611 MARGIN = 10
612 XML_START = "<?xml"
613
614 text = handle.read(BLOCK)
615 pending = ""
616
617 if not text:
618
619 raise ValueError("Your XML file was empty")
620
621 while text:
622
623 if not text.startswith(XML_START):
624 raise ValueError("Your XML file did not start with %s... "
625 "but instead %s"
626 % (XML_START, repr(text[:20])))
627
628 expat_parser = expat.ParserCreate()
629 blast_parser = BlastParser(debug)
630 expat_parser.StartElementHandler = blast_parser.startElement
631 expat_parser.EndElementHandler = blast_parser.endElement
632 expat_parser.CharacterDataHandler = blast_parser.characters
633
634 expat_parser.Parse(text, False)
635 while blast_parser._records:
636 record = blast_parser._records[0]
637 blast_parser._records = blast_parser._records[1:]
638 yield record
639
640 while True:
641
642 text, pending = pending + handle.read(BLOCK), ""
643 if not text:
644
645 expat_parser.Parse("", True)
646 break
647
648
649
650 pending = handle.read(MARGIN)
651
652 if ("\n" + XML_START) not in (text + pending):
653
654 expat_parser.Parse(text, False)
655 while blast_parser._records:
656 yield blast_parser._records.pop(0)
657 else:
658
659
660
661
662 text, pending = (text+pending).split("\n" + XML_START,1)
663 pending = XML_START + pending
664
665 expat_parser.Parse(text, True)
666 while blast_parser._records:
667 yield blast_parser._records.pop(0)
668
669
670
671 text, pending = pending, ""
672 break
673
674
675
676 while blast_parser._records:
677 yield blast_parser._records.pop(0)
678
679
680
681
682 assert pending==""
683 assert len(blast_parser._records) == 0
684
685
686 assert text==""
687 assert pending==""
688 assert len(blast_parser._records) == 0
689
690 if __name__ == '__main__':
691 import sys
692 handle = open(sys.argv[1])
693 r_list = parse(handle)
694
695 for r in r_list:
696
697 print 'Blast of', r.query
698 print 'Found %s alignments with a total of %s HSPs' % (len(r.alignments),
699 reduce(lambda a,b: a+b,
700 [len(a.hsps) for a in r.alignments]))
701
702 for al in r.alignments:
703 print al.title[:50], al.length, 'bp', len(al.hsps), 'HSPs'
704
705
706 E_VALUE_THRESH = 0.04
707 for alignment in r.alignments:
708 for hsp in alignment.hsps:
709 if hsp.expect < E_VALUE_THRESH:
710 print '*****'
711 print 'sequence', alignment.title
712 print 'length', alignment.length
713 print 'e value', hsp.expect
714 print hsp.query[:75] + '...'
715 print hsp.match[:75] + '...'
716 print hsp.sbjct[:75] + '...'
717