Package Bio :: Package Blast :: Module NCBIStandalone
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIStandalone

   1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # Patches by Mike Poidinger to support multiple databases. 
   6  # Updated by Peter Cock in 2007 to do a better job on BLAST 2.2.15 
   7   
   8  """Code for calling standalone BLAST and parsing plain text output (DEPRECATED). 
   9   
  10  Rather than parsing the human readable plain text BLAST output (which seems to 
  11  change with every update to BLAST), we and the NBCI recommend you parse the 
  12  XML output instead. The plain text parser in this module still works at the 
  13  time of writing, but is considered obsolete and updating it to cope with the 
  14  latest versions of BLAST is not a priority for us. 
  15   
  16  This module also provides code to work with the "legacy" standalone version of 
  17  NCBI BLAST, tools blastall, rpsblast and blastpgp via three helper functions of 
  18  the same name. These functions are very limited for dealing with the output as 
  19  files rather than handles, for which the wrappers in Bio.Blast.Applications are 
  20  preferred. Furthermore, the NCBI themselves regard these command line tools as 
  21  "legacy", and encourage using the new BLAST+ tools instead. Biopython has 
  22  wrappers for these under Bio.Blast.Applications (see the tutorial). 
  23  """ 
  24   
  25  from __future__ import print_function 
  26   
  27  import sys 
  28  import re 
  29   
  30  from Bio._py3k import StringIO 
  31  from Bio.ParserSupport import AbstractParser, AbstractConsumer 
  32  from Bio.ParserSupport import read_and_call, read_and_call_until 
  33  from Bio.ParserSupport import read_and_call_while, attempt_read_and_call 
  34  from Bio.ParserSupport import is_blank_line, safe_peekline, safe_readline 
  35  from Bio import File 
  36  from Bio.Blast import Record 
  37   
  38  from Bio import BiopythonDeprecationWarning 
  39  import warnings 
  40  warnings.warn("This module has been deprecated. Consider Bio.SearchIO for " 
  41                "parsing BLAST output instead.", BiopythonDeprecationWarning) 
  42   
  43   
  44  _score_e_re = re.compile(r'Score +E') 
  45   
  46   
47 -class LowQualityBlastError(Exception):
48 """Error caused by running a low quality sequence through BLAST. 49 50 When low quality sequences (like GenBank entries containing only 51 stretches of a single nucleotide) are BLASTed, they will result in 52 BLAST generating an error and not being able to perform the BLAST. 53 search. This error should be raised for the BLAST reports produced 54 in this case. 55 """ 56 57 pass
58 59
60 -class ShortQueryBlastError(Exception):
61 """Error caused by running a short query sequence through BLAST. 62 63 If the query sequence is too short, BLAST outputs warnings and errors:: 64 65 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed. 66 [blastall] ERROR: [000.000] AT1G08320: Blast: 67 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize 68 done 69 70 This exception is raised when that condition is detected. 71 """ 72 73 pass
74 75
76 -class _Scanner(object):
77 """Scan BLAST output from blastall or blastpgp. 78 79 Tested with blastall and blastpgp v2.0.10, v2.0.11 80 81 Methods: 82 - feed Feed data into the scanner. 83 84 """ 85
86 - def feed(self, handle, consumer):
87 """S.feed(handle, consumer) 88 89 Feed in a BLAST report for scanning. handle is a file-like 90 object that contains the BLAST report. consumer is a Consumer 91 object that will receive events as the report is scanned. 92 """ 93 if isinstance(handle, File.UndoHandle): 94 uhandle = handle 95 else: 96 uhandle = File.UndoHandle(handle) 97 98 # Try to fast-forward to the beginning of the blast report. 99 read_and_call_until(uhandle, consumer.noevent, contains='BLAST') 100 # Now scan the BLAST report. 101 self._scan_header(uhandle, consumer) 102 self._scan_rounds(uhandle, consumer) 103 self._scan_database_report(uhandle, consumer) 104 self._scan_parameters(uhandle, consumer)
105
106 - def _scan_header(self, uhandle, consumer):
107 # BLASTP 2.0.10 [Aug-26-1999] 108 # 109 # 110 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaf 111 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), 112 # "Gapped BLAST and PSI-BLAST: a new generation of protein database sea 113 # programs", Nucleic Acids Res. 25:3389-3402. 114 # 115 # Query= test 116 # (140 letters) 117 # 118 # Database: sdqib40-1.35.seg.fa 119 # 1323 sequences; 223,339 total letters 120 # 121 # ======================================================== 122 # This next example is from the online version of Blast, 123 # note there are TWO references, an RID line, and also 124 # the database is BEFORE the query line. 125 # Note there possibleuse of non-ASCII in the author names. 126 # ======================================================== 127 # 128 # BLASTP 2.2.15 [Oct-15-2006] 129 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Sch??ffer, 130 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman 131 # (1997), "Gapped BLAST and PSI-BLAST: a new generation of 132 # protein database search programs", Nucleic Acids Res. 25:3389-3402. 133 # 134 # Reference: Sch??ffer, Alejandro A., L. Aravind, Thomas L. Madden, Sergei 135 # Shavirin, John L. Spouge, Yuri I. Wolf, Eugene V. Koonin, and 136 # Stephen F. Altschul (2001), "Improving the accuracy of PSI-BLAST 137 # protein database searches with composition-based statistics 138 # and other refinements", Nucleic Acids Res. 29:2994-3005. 139 # 140 # RID: 1166022616-19998-65316425856.BLASTQ1 141 # 142 # 143 # Database: All non-redundant GenBank CDS 144 # translations+PDB+SwissProt+PIR+PRF excluding environmental samples 145 # 4,254,166 sequences; 1,462,033,012 total letters 146 # Query= gi:16127998 147 # Length=428 148 # 149 150 consumer.start_header() 151 152 read_and_call(uhandle, consumer.version, contains='BLAST') 153 read_and_call_while(uhandle, consumer.noevent, blank=1) 154 155 # There might be a <pre> line, for qblast output. 156 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>") 157 158 # Read the reference(s) 159 while attempt_read_and_call(uhandle, 160 consumer.reference, start='Reference'): 161 # References are normally multiline terminated by a blank line 162 # (or, based on the old code, the RID line) 163 while True: 164 line = uhandle.readline() 165 if is_blank_line(line): 166 consumer.noevent(line) 167 break 168 elif line.startswith("RID"): 169 break 170 else: 171 # More of the reference 172 consumer.reference(line) 173 174 # Deal with the optional RID: ... 175 read_and_call_while(uhandle, consumer.noevent, blank=1) 176 attempt_read_and_call(uhandle, consumer.reference, start="RID:") 177 read_and_call_while(uhandle, consumer.noevent, blank=1) 178 179 # blastpgp may have a reference for compositional score matrix 180 # adjustment (see Bug 2502): 181 if attempt_read_and_call( 182 uhandle, consumer.reference, start="Reference"): 183 read_and_call_until(uhandle, consumer.reference, blank=1) 184 read_and_call_while(uhandle, consumer.noevent, blank=1) 185 186 # blastpgp has a Reference for composition-based statistics. 187 if attempt_read_and_call( 188 uhandle, consumer.reference, start="Reference"): 189 read_and_call_until(uhandle, consumer.reference, blank=1) 190 read_and_call_while(uhandle, consumer.noevent, blank=1) 191 192 line = uhandle.peekline() 193 assert line.strip() != "" 194 assert not line.startswith("RID:") 195 if line.startswith("Query="): 196 # This is an old style query then database... 197 198 # Read the Query lines and the following blank line. 199 read_and_call(uhandle, consumer.query_info, start='Query=') 200 read_and_call_until(uhandle, consumer.query_info, blank=1) 201 read_and_call_while(uhandle, consumer.noevent, blank=1) 202 203 # Read the database lines and the following blank line. 204 read_and_call_until(uhandle, consumer.database_info, end='total letters') 205 read_and_call(uhandle, consumer.database_info, contains='sequences') 206 read_and_call_while(uhandle, consumer.noevent, blank=1) 207 elif line.startswith("Database:"): 208 # This is a new style database then query... 209 read_and_call_until(uhandle, consumer.database_info, end='total letters') 210 read_and_call(uhandle, consumer.database_info, contains='sequences') 211 read_and_call_while(uhandle, consumer.noevent, blank=1) 212 213 # Read the Query lines and the following blank line. 214 # Or, on BLAST 2.2.22+ there is no blank link - need to spot 215 # the "... Score E" line instead. 216 read_and_call(uhandle, consumer.query_info, start='Query=') 217 # BLAST 2.2.25+ has a blank line before Length= 218 read_and_call_until(uhandle, consumer.query_info, start='Length=') 219 while True: 220 line = uhandle.peekline() 221 if not line.strip() or _score_e_re.search(line) is not None: 222 break 223 # It is more of the query (and its length) 224 read_and_call(uhandle, consumer.query_info) 225 read_and_call_while(uhandle, consumer.noevent, blank=1) 226 else: 227 raise ValueError("Invalid header?") 228 229 consumer.end_header()
230
231 - def _scan_rounds(self, uhandle, consumer):
232 # Scan a bunch of rounds. 233 # Each round begins with either a "Searching......" line 234 # or a 'Score E' line followed by descriptions and alignments. 235 # The email server doesn't give the "Searching....." line. 236 # If there is no 'Searching.....' line then you'll first see a 237 # 'Results from round' line 238 239 while not self._eof(uhandle): 240 line = safe_peekline(uhandle) 241 if not line.startswith('Searching') and \ 242 not line.startswith('Results from round') and \ 243 _score_e_re.search(line) is None and \ 244 'No hits found' not in line: 245 break 246 self._scan_descriptions(uhandle, consumer) 247 self._scan_alignments(uhandle, consumer)
248
249 - def _scan_descriptions(self, uhandle, consumer):
250 # Searching..................................................done 251 # Results from round 2 252 # 253 # 254 # Sc 255 # Sequences producing significant alignments: (b 256 # Sequences used in model and found again: 257 # 258 # d1tde_2 3.4.1.4.4 (119-244) Thioredoxin reductase [Escherichia ... 259 # d1tcob_ 1.31.1.5.16 Calcineurin regulatory subunit (B-chain) [B... 260 # d1symb_ 1.31.1.2.2 Calcyclin (S100) [RAT (RATTUS NORVEGICUS)] 261 # 262 # Sequences not found previously or not previously below threshold: 263 # 264 # d1osa__ 1.31.1.5.11 Calmodulin [Paramecium tetraurelia] 265 # d1aoza3 2.5.1.3.3 (339-552) Ascorbate oxidase [zucchini (Cucurb... 266 # 267 268 # If PSI-BLAST, may also have: 269 # 270 # CONVERGED! 271 272 consumer.start_descriptions() 273 274 # Read 'Searching' 275 # This line seems to be missing in BLASTN 2.1.2 (others?) 276 attempt_read_and_call(uhandle, consumer.noevent, start='Searching') 277 278 # blastpgp 2.0.10 from NCBI 9/19/99 for Solaris sometimes crashes here. 279 # If this happens, the handle will yield no more information. 280 if not uhandle.peekline(): 281 raise ValueError("Unexpected end of blast report. " + 282 "Looks suspiciously like a PSI-BLAST crash.") 283 284 # BLASTN 2.2.3 sometimes spews a bunch of warnings and errors here: 285 # Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch 286 # [blastall] ERROR: [000.000] AT1G08320: Blast: 287 # [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at leas 288 # done 289 # Reported by David Weisman. 290 # Check for these error lines and ignore them for now. Let 291 # the BlastErrorParser deal with them. 292 line = uhandle.peekline() 293 if "ERROR:" in line or line.startswith("done"): 294 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:") 295 read_and_call(uhandle, consumer.noevent, start="done") 296 297 # Check to see if this is PSI-BLAST. 298 # If it is, the 'Searching' line will be followed by: 299 # (version 2.0.10) 300 # Searching............................. 301 # Results from round 2 302 # or (version 2.0.11) 303 # Searching............................. 304 # 305 # 306 # Results from round 2 307 308 # Skip a bunch of blank lines. 309 read_and_call_while(uhandle, consumer.noevent, blank=1) 310 # Check for the results line if it's there. 311 if attempt_read_and_call(uhandle, consumer.round, start='Results'): 312 read_and_call_while(uhandle, consumer.noevent, blank=1) 313 314 # Three things can happen here: 315 # 1. line contains 'Score E' 316 # 2. line contains "No hits found" 317 # 3. no descriptions 318 # The first one begins a bunch of descriptions. The last two 319 # indicates that no descriptions follow, and we should go straight 320 # to the alignments. 321 if not attempt_read_and_call( 322 uhandle, consumer.description_header, 323 has_re=_score_e_re): 324 # Either case 2 or 3. Look for "No hits found". 325 attempt_read_and_call(uhandle, consumer.no_hits, 326 contains='No hits found') 327 try: 328 read_and_call_while(uhandle, consumer.noevent, blank=1) 329 except ValueError as err: 330 if str(err) != "Unexpected end of stream.": 331 raise err 332 333 consumer.end_descriptions() 334 # Stop processing. 335 return 336 337 # Read the score header lines 338 read_and_call(uhandle, consumer.description_header, 339 start='Sequences producing') 340 341 # If PSI-BLAST, read the 'Sequences used in model' line. 342 attempt_read_and_call(uhandle, consumer.model_sequences, 343 start='Sequences used in model') 344 read_and_call_while(uhandle, consumer.noevent, blank=1) 345 346 # In BLAT, rather than a "No hits found" line, we just 347 # get no descriptions (and no alignments). This can be 348 # spotted because the next line is the database block: 349 if safe_peekline(uhandle).startswith(" Database:"): 350 consumer.end_descriptions() 351 # Stop processing. 352 return 353 354 # Read the descriptions and the following blank lines, making 355 # sure that there are descriptions. 356 if not uhandle.peekline().startswith('Sequences not found'): 357 read_and_call_until(uhandle, consumer.description, blank=1) 358 read_and_call_while(uhandle, consumer.noevent, blank=1) 359 360 # If PSI-BLAST, read the 'Sequences not found' line followed 361 # by more descriptions. However, I need to watch out for the 362 # case where there were no sequences not found previously, in 363 # which case there will be no more descriptions. 364 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences, 365 start='Sequences not found'): 366 # Read the descriptions and the following blank lines. 367 read_and_call_while(uhandle, consumer.noevent, blank=1) 368 l = safe_peekline(uhandle) 369 # Brad -- added check for QUERY. On some PSI-BLAST outputs 370 # there will be a 'Sequences not found' line followed by no 371 # descriptions. Check for this case since the first thing you'll 372 # get is a blank line and then 'QUERY' 373 if not l.startswith('CONVERGED') and l[0] != '>' \ 374 and not l.startswith('QUERY'): 375 read_and_call_until(uhandle, consumer.description, blank=1) 376 read_and_call_while(uhandle, consumer.noevent, blank=1) 377 378 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED') 379 read_and_call_while(uhandle, consumer.noevent, blank=1) 380 381 consumer.end_descriptions()
382
383 - def _scan_alignments(self, uhandle, consumer):
384 if self._eof(uhandle): 385 return 386 387 # qblast inserts a helpful line here. 388 attempt_read_and_call(uhandle, consumer.noevent, start="ALIGNMENTS") 389 390 # First, check to see if I'm at the database report. 391 line = safe_peekline(uhandle) 392 if not line: 393 # EOF 394 return 395 elif line.startswith(' Database') or line.startswith("Lambda"): 396 return 397 elif line[0] == '>': 398 # XXX make a better check here between pairwise and masterslave 399 self._scan_pairwise_alignments(uhandle, consumer) 400 elif line.startswith('Effective'): 401 return 402 else: 403 # XXX put in a check to make sure I'm in a masterslave alignment 404 self._scan_masterslave_alignment(uhandle, consumer)
405
406 - def _scan_pairwise_alignments(self, uhandle, consumer):
407 while not self._eof(uhandle): 408 line = safe_peekline(uhandle) 409 if line[0] != '>': 410 break 411 self._scan_one_pairwise_alignment(uhandle, consumer)
412
413 - def _scan_one_pairwise_alignment(self, uhandle, consumer):
414 if self._eof(uhandle): 415 return 416 consumer.start_alignment() 417 418 self._scan_alignment_header(uhandle, consumer) 419 420 # Scan a bunch of score/alignment pairs. 421 while True: 422 if self._eof(uhandle): 423 # Shouldn't have issued that _scan_alignment_header event... 424 break 425 line = safe_peekline(uhandle) 426 if not line.startswith(' Score'): 427 break 428 self._scan_hsp(uhandle, consumer) 429 consumer.end_alignment()
430
431 - def _scan_alignment_header(self, uhandle, consumer):
432 # >d1rip__ 2.24.7.1.1 Ribosomal S17 protein [Bacillus 433 # stearothermophilus] 434 # Length = 81 435 # 436 # Or, more recently with different white space: 437 # 438 # >gi|15799684|ref|NP_285696.1| threonine synthase ... 439 # gi|15829258|ref|NP_308031.1| threonine synthase 440 # ... 441 # Length=428 442 read_and_call(uhandle, consumer.title, start='>') 443 while True: 444 line = safe_readline(uhandle) 445 if line.lstrip().startswith(('Length =', 'Length=')): 446 consumer.length(line) 447 break 448 elif is_blank_line(line): 449 # Check to make sure I haven't missed the Length line 450 raise ValueError("I missed the Length in an alignment header") 451 consumer.title(line) 452 453 # Older versions of BLAST will have a line with some spaces. 454 # Version 2.0.14 (maybe 2.0.13?) and above print a true blank line. 455 if not attempt_read_and_call(uhandle, consumer.noevent, 456 start=' '): 457 read_and_call(uhandle, consumer.noevent, blank=1)
458
459 - def _scan_hsp(self, uhandle, consumer):
460 consumer.start_hsp() 461 self._scan_hsp_header(uhandle, consumer) 462 self._scan_hsp_alignment(uhandle, consumer) 463 consumer.end_hsp()
464
465 - def _scan_hsp_header(self, uhandle, consumer):
466 # Score = 22.7 bits (47), Expect = 2.5 467 # Identities = 10/36 (27%), Positives = 18/36 (49%) 468 # Strand = Plus / Plus 469 # Frame = +3 470 # 471 472 read_and_call(uhandle, consumer.score, start=' Score') 473 read_and_call(uhandle, consumer.identities, start=' Identities') 474 # BLASTN 475 attempt_read_and_call(uhandle, consumer.strand, start=' Strand') 476 # BLASTX, TBLASTN, TBLASTX 477 attempt_read_and_call(uhandle, consumer.frame, start=' Frame') 478 read_and_call(uhandle, consumer.noevent, blank=1)
479
480 - def _scan_hsp_alignment(self, uhandle, consumer):
481 # Query: 11 GRGVSACA-------TCDGFFYRNQKVAVIGGGNTAVEEALYLSNIASEVHLIHRRDGF 482 # GRGVS+ TC Y + + V GGG+ + EE L + I R+ 483 # Sbjct: 12 GRGVSSVVRRCIHKPTCKE--YAVKIIDVTGGGSFSAEEVQELREATLKEVDILRKVSG 484 # 485 # Query: 64 AEKILIKR 71 486 # I +K 487 # Sbjct: 70 PNIIQLKD 77 488 # 489 490 while True: 491 # Blastn adds an extra line filled with spaces before Query 492 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 493 read_and_call(uhandle, consumer.query, start='Query') 494 read_and_call(uhandle, consumer.align, start=' ') 495 read_and_call(uhandle, consumer.sbjct, start='Sbjct') 496 try: 497 read_and_call_while(uhandle, consumer.noevent, blank=1) 498 except ValueError as err: 499 if str(err) != "Unexpected end of stream.": 500 raise err 501 # End of File (well, it looks like it with recent versions 502 # of BLAST for multiple queries after the Iterator class 503 # has broken up the whole file into chunks). 504 break 505 line = safe_peekline(uhandle) 506 # Alignment continues if I see a 'Query' or the spaces for Blastn. 507 if not (line.startswith('Query') or line.startswith(' ')): 508 break
509
510 - def _scan_masterslave_alignment(self, uhandle, consumer):
511 consumer.start_alignment() 512 while True: 513 line = safe_readline(uhandle) 514 # Check to see whether I'm finished reading the alignment. 515 # This is indicated by 1) database section, 2) next psi-blast 516 # round, which can also be a 'Results from round' if no 517 # searching line is present 518 # patch by chapmanb 519 if line.startswith('Searching') or \ 520 line.startswith('Results from round'): 521 uhandle.saveline(line) 522 break 523 elif line.startswith(' Database'): 524 uhandle.saveline(line) 525 break 526 elif is_blank_line(line): 527 consumer.noevent(line) 528 else: 529 consumer.multalign(line) 530 read_and_call_while(uhandle, consumer.noevent, blank=1) 531 consumer.end_alignment()
532
533 - def _eof(self, uhandle):
534 try: 535 line = safe_peekline(uhandle) 536 except ValueError as err: 537 if str(err) != "Unexpected end of stream.": 538 raise err 539 line = "" 540 return not line
541
542 - def _scan_database_report(self, uhandle, consumer):
543 # Database: sdqib40-1.35.seg.fa 544 # Posted date: Nov 1, 1999 4:25 PM 545 # Number of letters in database: 223,339 546 # Number of sequences in database: 1323 547 # 548 # Lambda K H 549 # 0.322 0.133 0.369 550 # 551 # Gapped 552 # Lambda K H 553 # 0.270 0.0470 0.230 554 # 555 ########################################## 556 # Or, more recently Blast 2.2.15 gives less blank lines 557 ########################################## 558 # Database: All non-redundant GenBank CDS translations+PDB+SwissProt+PIR+PRF excluding 559 # environmental samples 560 # Posted date: Dec 12, 2006 5:51 PM 561 # Number of letters in database: 667,088,753 562 # Number of sequences in database: 2,094,974 563 # Lambda K H 564 # 0.319 0.136 0.395 565 # Gapped 566 # Lambda K H 567 # 0.267 0.0410 0.140 568 569 if self._eof(uhandle): 570 return 571 572 consumer.start_database_report() 573 574 # Subset of the database(s) listed below 575 # Number of letters searched: 562,618,960 576 # Number of sequences searched: 228,924 577 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"): 578 read_and_call(uhandle, consumer.noevent, contains="letters") 579 read_and_call(uhandle, consumer.noevent, contains="sequences") 580 read_and_call(uhandle, consumer.noevent, start=" ") 581 582 # Sameet Mehta reported seeing output from BLASTN 2.2.9 that 583 # was missing the "Database" stanza completely. 584 while attempt_read_and_call(uhandle, consumer.database, 585 start=' Database'): 586 # BLAT output ends abruptly here, without any of the other 587 # information. Check to see if this is the case. If so, 588 # then end the database report here gracefully. 589 if not uhandle.peekline().strip() or \ 590 uhandle.peekline().startswith("BLAST"): 591 consumer.end_database_report() 592 return 593 594 # Database can span multiple lines. 595 read_and_call_until(uhandle, consumer.database, start=' Posted') 596 read_and_call(uhandle, consumer.posted_date, start=' Posted') 597 read_and_call(uhandle, consumer.num_letters_in_database, 598 start=' Number of letters') 599 read_and_call(uhandle, consumer.num_sequences_in_database, 600 start=' Number of sequences') 601 # There may not be a line starting with spaces... 602 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 603 604 line = safe_readline(uhandle) 605 uhandle.saveline(line) 606 if 'Lambda' in line: 607 break 608 609 try: 610 read_and_call(uhandle, consumer.noevent, start='Lambda') 611 read_and_call(uhandle, consumer.ka_params) 612 except Exception: # TODO: ValueError, AttributeError? 613 pass 614 615 # This blank line is optional: 616 attempt_read_and_call(uhandle, consumer.noevent, blank=1) 617 618 # not BLASTP 619 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped') 620 # not TBLASTX 621 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'): 622 read_and_call(uhandle, consumer.ka_params_gap) 623 624 # Blast 2.2.4 can sometimes skip the whole parameter section. 625 # Thus, I need to be careful not to read past the end of the 626 # file. 627 try: 628 read_and_call_while(uhandle, consumer.noevent, blank=1) 629 except ValueError as x: 630 if str(x) != "Unexpected end of stream.": 631 raise 632 consumer.end_database_report()
633
634 - def _scan_parameters(self, uhandle, consumer):
635 # Matrix: BLOSUM62 636 # Gap Penalties: Existence: 11, Extension: 1 637 # Number of Hits to DB: 50604 638 # Number of Sequences: 1323 639 # Number of extensions: 1526 640 # Number of successful extensions: 6 641 # Number of sequences better than 10.0: 5 642 # Number of HSP's better than 10.0 without gapping: 5 643 # Number of HSP's successfully gapped in prelim test: 0 644 # Number of HSP's that attempted gapping in prelim test: 1 645 # Number of HSP's gapped (non-prelim): 5 646 # length of query: 140 647 # length of database: 223,339 648 # effective HSP length: 39 649 # effective length of query: 101 650 # effective length of database: 171,742 651 # effective search space: 17345942 652 # effective search space used: 17345942 653 # T: 11 654 # A: 40 655 # X1: 16 ( 7.4 bits) 656 # X2: 38 (14.8 bits) 657 # X3: 64 (24.9 bits) 658 # S1: 41 (21.9 bits) 659 # S2: 42 (20.8 bits) 660 ########################################## 661 # Or, more recently Blast(x) 2.2.15 gives 662 ########################################## 663 # Matrix: BLOSUM62 664 # Gap Penalties: Existence: 11, Extension: 1 665 # Number of Sequences: 4535438 666 # Number of Hits to DB: 2,588,844,100 667 # Number of extensions: 60427286 668 # Number of successful extensions: 126433 669 # Number of sequences better than 2.0: 30 670 # Number of HSP's gapped: 126387 671 # Number of HSP's successfully gapped: 35 672 # Length of query: 291 673 # Length of database: 1,573,298,872 674 # Length adjustment: 130 675 # Effective length of query: 161 676 # Effective length of database: 983,691,932 677 # Effective search space: 158374401052 678 # Effective search space used: 158374401052 679 # Neighboring words threshold: 12 680 # Window for multiple hits: 40 681 # X1: 16 ( 7.3 bits) 682 # X2: 38 (14.6 bits) 683 # X3: 64 (24.7 bits) 684 # S1: 41 (21.7 bits) 685 # S2: 32 (16.9 bits) 686 687 # Blast 2.2.4 can sometimes skip the whole parameter section. 688 # BLAT also skips the whole parameter section. 689 # Thus, check to make sure that the parameter section really 690 # exists. 691 if not uhandle.peekline().strip(): 692 return 693 694 # BLASTN 2.2.9 looks like it reverses the "Number of Hits" and 695 # "Number of Sequences" lines. 696 consumer.start_parameters() 697 698 # Matrix line may be missing in BLASTN 2.2.9 699 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix') 700 # not TBLASTX 701 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap') 702 703 attempt_read_and_call(uhandle, consumer.num_sequences, 704 start='Number of Sequences') 705 attempt_read_and_call(uhandle, consumer.num_hits, 706 start='Number of Hits') 707 attempt_read_and_call(uhandle, consumer.num_sequences, 708 start='Number of Sequences') 709 attempt_read_and_call(uhandle, consumer.num_extends, 710 start='Number of extensions') 711 attempt_read_and_call(uhandle, consumer.num_good_extends, 712 start='Number of successful') 713 714 attempt_read_and_call(uhandle, consumer.num_seqs_better_e, 715 start='Number of sequences') 716 717 # not BLASTN, TBLASTX 718 if attempt_read_and_call(uhandle, consumer.hsps_no_gap, 719 start="Number of HSP's better"): 720 # BLASTN 2.2.9 721 if attempt_read_and_call(uhandle, consumer.noevent, 722 start="Number of HSP's gapped:"): 723 read_and_call(uhandle, consumer.noevent, 724 start="Number of HSP's successfully") 725 # This is omitted in 2.2.15 726 attempt_read_and_call(uhandle, consumer.noevent, 727 start="Number of extra gapped extensions") 728 else: 729 read_and_call(uhandle, consumer.hsps_prelim_gapped, 730 start="Number of HSP's successfully") 731 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted, 732 start="Number of HSP's that") 733 read_and_call(uhandle, consumer.hsps_gapped, 734 start="Number of HSP's gapped") 735 # e.g. BLASTX 2.2.15 where the "better" line is missing 736 elif attempt_read_and_call(uhandle, consumer.noevent, 737 start="Number of HSP's gapped"): 738 read_and_call(uhandle, consumer.noevent, 739 start="Number of HSP's successfully") 740 741 # not in blastx 2.2.1 742 attempt_read_and_call(uhandle, consumer.query_length, 743 has_re=re.compile(r"[Ll]ength of query")) 744 # Not in BLASTX 2.2.22+ 745 attempt_read_and_call(uhandle, consumer.database_length, 746 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase")) 747 748 # BLASTN 2.2.9 749 attempt_read_and_call(uhandle, consumer.noevent, 750 start="Length adjustment") 751 attempt_read_and_call(uhandle, consumer.effective_hsp_length, 752 start='effective HSP') 753 # Not in blastx 2.2.1 754 attempt_read_and_call( 755 uhandle, consumer.effective_query_length, 756 has_re=re.compile(r'[Ee]ffective length of query')) 757 758 # This is not in BLASTP 2.2.15 759 attempt_read_and_call( 760 uhandle, consumer.effective_database_length, 761 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase')) 762 # Not in blastx 2.2.1, added a ':' to distinguish between 763 # this and the 'effective search space used' line 764 attempt_read_and_call( 765 uhandle, consumer.effective_search_space, 766 has_re=re.compile(r'[Ee]ffective search space:')) 767 # Does not appear in BLASTP 2.0.5 768 attempt_read_and_call( 769 uhandle, consumer.effective_search_space_used, 770 has_re=re.compile(r'[Ee]ffective search space used')) 771 772 # BLASTX, TBLASTN, TBLASTX 773 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift') 774 775 # not in BLASTN 2.2.9 776 attempt_read_and_call(uhandle, consumer.threshold, start='T') 777 # In BLASTX 2.2.15 replaced by: "Neighboring words threshold: 12" 778 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring words threshold') 779 780 # not in BLASTX 2.2.15 781 attempt_read_and_call(uhandle, consumer.window_size, start='A') 782 # get this instead: "Window for multiple hits: 40" 783 attempt_read_and_call(uhandle, consumer.window_size, start='Window for multiple hits') 784 785 # not in BLASTX 2.2.22+ 786 attempt_read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1') 787 # not TBLASTN 788 attempt_read_and_call(uhandle, consumer.gap_x_dropoff, start='X2') 789 790 # not BLASTN, TBLASTX 791 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final, 792 start='X3') 793 794 # not TBLASTN 795 attempt_read_and_call(uhandle, consumer.gap_trigger, start='S1') 796 # not in blastx 2.2.1 797 # first we make sure we have additional lines to work with, if 798 # not then the file is done and we don't have a final S2 799 if not is_blank_line(uhandle.peekline(), allow_spaces=1): 800 read_and_call(uhandle, consumer.blast_cutoff, start='S2') 801 802 consumer.end_parameters()
803 804
805 -class BlastParser(AbstractParser):
806 """Parses BLAST data into a Record.Blast object.""" 807
808 - def __init__(self):
809 """__init__(self)""" 810 self._scanner = _Scanner() 811 self._consumer = _BlastConsumer()
812
813 - def parse(self, handle):
814 """parse(self, handle)""" 815 self._scanner.feed(handle, self._consumer) 816 return self._consumer.data
817 818
819 -class PSIBlastParser(AbstractParser):
820 """Parses BLAST data into a Record.PSIBlast object.""" 821
822 - def __init__(self):
823 """__init__(self)""" 824 self._scanner = _Scanner() 825 self._consumer = _PSIBlastConsumer()
826
827 - def parse(self, handle):
828 """parse(self, handle)""" 829 self._scanner.feed(handle, self._consumer) 830 return self._consumer.data
831 832
833 -class _HeaderConsumer(object):
834 - def start_header(self):
835 self._header = Record.Header()
836
837 - def version(self, line):
838 c = line.split() 839 self._header.application = c[0] 840 self._header.version = c[1] 841 if len(c) > 2: 842 # The date is missing in the new C++ output from blastx 2.2.22+ 843 # Just get "BLASTX 2.2.22+\n" and that's all. 844 self._header.date = c[2][1:-1]
845
846 - def reference(self, line):
847 if line.startswith('Reference: '): 848 self._header.reference = line[11:] 849 else: 850 self._header.reference = self._header.reference + line
851
852 - def query_info(self, line):
853 if line.startswith('Query= '): 854 self._header.query = line[7:].lstrip() 855 elif line.startswith('Length='): 856 # New style way to give the query length in BLAST 2.2.22+ (the C++ code) 857 self._header.query_letters = _safe_int(line[7:].strip()) 858 elif not line.startswith(' '): # continuation of query_info 859 self._header.query = "%s%s" % (self._header.query, line) 860 else: 861 # Hope it is the old style way to give the query length: 862 letters, = _re_search( 863 r"([0-9,]+) letters", line, 864 "I could not find the number of letters in line\n%s" % line) 865 self._header.query_letters = _safe_int(letters)
866
867 - def database_info(self, line):
868 line = line.rstrip() 869 if line.startswith('Database: '): 870 self._header.database = line[10:] 871 elif not line.endswith('total letters'): 872 if self._header.database: 873 # Need to include a space when merging multi line datase descr 874 self._header.database = self._header.database + " " + line.strip() 875 else: 876 self._header.database = line.strip() 877 else: 878 sequences, letters = _re_search( 879 r"([0-9,]+) sequences; ([0-9,-]+) total letters", line, 880 "I could not find the sequences and letters in line\n%s" % line) 881 self._header.database_sequences = _safe_int(sequences) 882 self._header.database_letters = _safe_int(letters)
883
884 - def end_header(self):
885 # Get rid of the trailing newlines 886 self._header.reference = self._header.reference.rstrip() 887 self._header.query = self._header.query.rstrip()
888 889
890 -class _DescriptionConsumer(object):
891 - def start_descriptions(self):
892 self._descriptions = [] 893 self._model_sequences = [] 894 self._nonmodel_sequences = [] 895 self._converged = 0 896 self._type = None 897 self._roundnum = None 898 899 self.__has_n = 0 # Does the description line contain an N value?
900
901 - def description_header(self, line):
902 if line.startswith('Sequences producing'): 903 cols = line.split() 904 if cols[-1] == 'N': 905 self.__has_n = 1
906
907 - def description(self, line):
908 dh = self._parse(line) 909 if self._type == 'model': 910 self._model_sequences.append(dh) 911 elif self._type == 'nonmodel': 912 self._nonmodel_sequences.append(dh) 913 else: 914 self._descriptions.append(dh)
915
916 - def model_sequences(self, line):
917 self._type = 'model'
918
919 - def nonmodel_sequences(self, line):
920 self._type = 'nonmodel'
921
922 - def converged(self, line):
923 self._converged = 1
924
925 - def no_hits(self, line):
926 pass
927
928 - def round(self, line):
929 if not line.startswith('Results from round'): 930 raise ValueError("I didn't understand the round line\n%s" % line) 931 self._roundnum = _safe_int(line[18:].strip())
932
933 - def end_descriptions(self):
934 pass
935
936 - def _parse(self, description_line):
937 line = description_line # for convenience 938 dh = Record.Description() 939 940 # I need to separate the score and p-value from the title. 941 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 942 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 1 943 # special cases to handle: 944 # - title must be preserved exactly (including whitespaces) 945 # - score could be equal to e-value (not likely, but what if??) 946 # - sometimes there's an "N" score of '1'. 947 cols = line.split() 948 if len(cols) < 3: 949 raise ValueError( 950 "Line does not appear to contain description:\n%s" % line) 951 if self.__has_n: 952 i = line.rfind(cols[-1]) # find start of N 953 i = line.rfind(cols[-2], 0, i) # find start of p-value 954 i = line.rfind(cols[-3], 0, i) # find start of score 955 else: 956 i = line.rfind(cols[-1]) # find start of p-value 957 i = line.rfind(cols[-2], 0, i) # find start of score 958 if self.__has_n: 959 dh.title, dh.score, dh.e, dh.num_alignments = \ 960 line[:i].rstrip(), cols[-3], cols[-2], cols[-1] 961 else: 962 dh.title, dh.score, dh.e, dh.num_alignments = \ 963 line[:i].rstrip(), cols[-2], cols[-1], 1 964 dh.num_alignments = _safe_int(dh.num_alignments) 965 dh.score = _safe_int(dh.score) 966 dh.e = _safe_float(dh.e) 967 return dh
968 969
970 -class _AlignmentConsumer(object):
971 # This is a little bit tricky. An alignment can either be a 972 # pairwise alignment or a multiple alignment. Since it's difficult 973 # to know a-priori which one the blast record will contain, I'm going 974 # to make one class that can parse both of them.
975 - def start_alignment(self):
976 self._alignment = Record.Alignment() 977 self._multiple_alignment = Record.MultipleAlignment()
978
979 - def title(self, line):
980 if self._alignment.title: 981 self._alignment.title += " " 982 self._alignment.title += line.strip()
983
984 - def length(self, line):
985 # e.g. "Length = 81" or more recently, "Length=428" 986 parts = line.replace(" ", "").split("=") 987 assert len(parts) == 2, "Unrecognised format length line" 988 self._alignment.length = parts[1] 989 self._alignment.length = _safe_int(self._alignment.length)
990
991 - def multalign(self, line):
992 # Standalone version uses 'QUERY', while WWW version uses blast_tmp. 993 if line.startswith('QUERY') or line.startswith('blast_tmp'): 994 # If this is the first line of the multiple alignment, 995 # then I need to figure out how the line is formatted. 996 997 # Format of line is: 998 # QUERY 1 acttg...gccagaggtggtttattcagtctccataagagaggggacaaacg 60 999 try: 1000 name, start, seq, end = line.split() 1001 except ValueError: 1002 raise ValueError("I do not understand the line\n%s" % line) 1003 self._start_index = line.index(start, len(name)) 1004 self._seq_index = line.index(seq, 1005 self._start_index + len(start)) 1006 # subtract 1 for the space 1007 self._name_length = self._start_index - 1 1008 self._start_length = self._seq_index - self._start_index - 1 1009 self._seq_length = line.rfind(end) - self._seq_index - 1 1010 1011 # self._seq_index = line.index(seq) 1012 # # subtract 1 for the space 1013 # self._seq_length = line.rfind(end) - self._seq_index - 1 1014 # self._start_index = line.index(start) 1015 # self._start_length = self._seq_index - self._start_index - 1 1016 # self._name_length = self._start_index 1017 1018 # Extract the information from the line 1019 name = line[:self._name_length] 1020 name = name.rstrip() 1021 start = line[self._start_index:self._start_index + self._start_length] 1022 start = start.rstrip() 1023 if start: 1024 start = _safe_int(start) 1025 end = line[self._seq_index + self._seq_length:].rstrip() 1026 if end: 1027 end = _safe_int(end) 1028 seq = line[self._seq_index:self._seq_index + self._seq_length].rstrip() 1029 # right pad the sequence with spaces if necessary 1030 if len(seq) < self._seq_length: 1031 seq += ' ' * (self._seq_length - len(seq)) 1032 1033 # I need to make sure the sequence is aligned correctly with the query. 1034 # First, I will find the length of the query. Then, if necessary, 1035 # I will pad my current sequence with spaces so that they will line 1036 # up correctly. 1037 1038 # Two possible things can happen: 1039 # QUERY 1040 # 504 1041 # 1042 # QUERY 1043 # 403 1044 # 1045 # Sequence 504 will need padding at the end. Since I won't know 1046 # this until the end of the alignment, this will be handled in 1047 # end_alignment. 1048 # Sequence 403 will need padding before being added to the alignment. 1049 1050 align = self._multiple_alignment.alignment # for convenience 1051 align.append((name, start, seq, end))
1052 1053 # This is old code that tried to line up all the sequences 1054 # in a multiple alignment by using the sequence title's as 1055 # identifiers. The problem with this is that BLAST assigns 1056 # different HSP's from the same sequence the same id. Thus, 1057 # in one alignment block, there may be multiple sequences with 1058 # the same id. I'm not sure how to handle this, so I'm not 1059 # going to. 1060 1061 # # If the sequence is the query, then just add it. 1062 # if name == 'QUERY': 1063 # if len(align) == 0: 1064 # align.append((name, start, seq)) 1065 # else: 1066 # aname, astart, aseq = align[0] 1067 # if name != aname: 1068 # raise ValueError, "Query is not the first sequence" 1069 # aseq = aseq + seq 1070 # align[0] = aname, astart, aseq 1071 # else: 1072 # if len(align) == 0: 1073 # raise ValueError, "I could not find the query sequence" 1074 # qname, qstart, qseq = align[0] 1075 # 1076 # # Now find my sequence in the multiple alignment. 1077 # for i in range(1, len(align)): 1078 # aname, astart, aseq = align[i] 1079 # if name == aname: 1080 # index = i 1081 # break 1082 # else: 1083 # # If I couldn't find it, then add a new one. 1084 # align.append((None, None, None)) 1085 # index = len(align)-1 1086 # # Make sure to left-pad it. 1087 # aname, astart, aseq = name, start, ' '*(len(qseq)-len(seq)) 1088 # 1089 # if len(qseq) != len(aseq) + len(seq): 1090 # # If my sequences are shorter than the query sequence, 1091 # # then I will need to pad some spaces to make them line up. 1092 # # Since I've already right padded seq, that means aseq 1093 # # must be too short. 1094 # aseq = aseq + ' '*(len(qseq)-len(aseq)-len(seq)) 1095 # aseq = aseq + seq 1096 # if astart is None: 1097 # astart = start 1098 # align[index] = aname, astart, aseq 1099
1100 - def end_alignment(self):
1101 # Remove trailing newlines 1102 if self._alignment: 1103 self._alignment.title = self._alignment.title.rstrip() 1104 1105 # This code is also obsolete. See note above. 1106 # If there's a multiple alignment, I will need to make sure 1107 # all the sequences are aligned. That is, I may need to 1108 # right-pad the sequences. 1109 # if self._multiple_alignment is not None: 1110 # align = self._multiple_alignment.alignment 1111 # seqlen = None 1112 # for i in range(len(align)): 1113 # name, start, seq = align[i] 1114 # if seqlen is None: 1115 # seqlen = len(seq) 1116 # else: 1117 # if len(seq) < seqlen: 1118 # seq = seq + ' '*(seqlen - len(seq)) 1119 # align[i] = name, start, seq 1120 # elif len(seq) > seqlen: 1121 # raise ValueError, \ 1122 # "Sequence %s is longer than the query" % name 1123 1124 # Clean up some variables, if they exist. 1125 try: 1126 del self._seq_index 1127 del self._seq_length 1128 del self._start_index 1129 del self._start_length 1130 del self._name_length 1131 except AttributeError: 1132 pass
1133 1134
1135 -class _HSPConsumer(object):
1136 - def start_hsp(self):
1137 self._hsp = Record.HSP()
1138
1139 - def score(self, line):
1140 self._hsp.bits, self._hsp.score = _re_search( 1141 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line, 1142 "I could not find the score in line\n%s" % line) 1143 self._hsp.score = _safe_float(self._hsp.score) 1144 self._hsp.bits = _safe_float(self._hsp.bits) 1145 1146 x, y = _re_search( 1147 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line, 1148 "I could not find the expect in line\n%s" % line) 1149 if x: 1150 self._hsp.num_alignments = _safe_int(x) 1151 else: 1152 self._hsp.num_alignments = 1 1153 self._hsp.expect = _safe_float(y)
1154
1155 - def identities(self, line):
1156 x, y = _re_search( 1157 r"Identities = (\d+)\/(\d+)", line, 1158 "I could not find the identities in line\n%s" % line) 1159 self._hsp.identities = _safe_int(x), _safe_int(y) 1160 self._hsp.align_length = _safe_int(y) 1161 1162 if 'Positives' in line: 1163 x, y = _re_search( 1164 r"Positives = (\d+)\/(\d+)", line, 1165 "I could not find the positives in line\n%s" % line) 1166 self._hsp.positives = _safe_int(x), _safe_int(y) 1167 assert self._hsp.align_length == _safe_int(y) 1168 1169 if 'Gaps' in line: 1170 x, y = _re_search( 1171 r"Gaps = (\d+)\/(\d+)", line, 1172 "I could not find the gaps in line\n%s" % line) 1173 self._hsp.gaps = _safe_int(x), _safe_int(y) 1174 assert self._hsp.align_length == _safe_int(y)
1175
1176 - def strand(self, line):
1177 self._hsp.strand = _re_search( 1178 r"Strand\s?=\s?(\w+)\s?/\s?(\w+)", line, 1179 "I could not find the strand in line\n%s" % line)
1180
1181 - def frame(self, line):
1182 # Frame can be in formats: 1183 # Frame = +1 1184 # Frame = +2 / +2 1185 if '/' in line: 1186 self._hsp.frame = _re_search( 1187 r"Frame\s?=\s?([-+][123])\s?/\s?([-+][123])", line, 1188 "I could not find the frame in line\n%s" % line) 1189 else: 1190 self._hsp.frame = _re_search( 1191 r"Frame = ([-+][123])", line, 1192 "I could not find the frame in line\n%s" % line)
1193 1194 # Match a space, if one is available. Masahir Ishikawa found a 1195 # case where there's no space between the start and the sequence: 1196 # Query: 100tt 101 1197 # line below modified by Yair Benita, Sep 2004 1198 # Note that the colon is not always present. 2006 1199 _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)") 1200
1201 - def query(self, line):
1202 m = self._query_re.search(line) 1203 if m is None: 1204 if line.strip() == "Query ------------------------------------------------------------": 1205 # Special case - long gap relative to the subject, 1206 # note there is no start/end present, cannot update those 1207 self._hsp.query += "-" * 60 1208 self._query_len = 60 # number of dashes 1209 self._query_start_index = 13 # offset of first dash 1210 return 1211 raise ValueError("I could not find the query in line\n%s" % line) 1212 1213 # line below modified by Yair Benita, Sep 2004. 1214 # added the end attribute for the query 1215 colon, start, seq, end = m.groups() 1216 seq = seq.strip() 1217 self._hsp.query += seq 1218 if self._hsp.query_start is None: 1219 self._hsp.query_start = _safe_int(start) 1220 1221 # line below added by Yair Benita, Sep 2004. 1222 # added the end attribute for the query 1223 self._hsp.query_end = _safe_int(end) 1224 1225 # Get index for sequence start (regular expression element 3) 1226 self._query_start_index = m.start(3) 1227 self._query_len = len(seq)
1228
1229 - def align(self, line):
1230 seq = line[self._query_start_index:].rstrip() 1231 if len(seq) < self._query_len: 1232 # Make sure the alignment is the same length as the query 1233 seq += ' ' * (self._query_len - len(seq)) 1234 elif len(seq) < self._query_len: 1235 raise ValueError("Match is longer than the query in line\n%s" 1236 % line) 1237 self._hsp.match = self._hsp.match + seq
1238 1239 # To match how we do the query, cache the regular expression. 1240 # Note that the colon is not always present. 1241 _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)") 1242
1243 - def sbjct(self, line):
1244 m = self._sbjct_re.search(line) 1245 if m is None: 1246 raise ValueError("I could not find the sbjct in line\n%s" % line) 1247 colon, start, seq, end = m.groups() 1248 # mikep 26/9/00 1249 # On occasion, there is a blast hit with no subject match 1250 # so far, it only occurs with 1-line short "matches" 1251 # I have decided to let these pass as they appear 1252 if not seq.strip(): 1253 seq = ' ' * self._query_len 1254 else: 1255 seq = seq.strip() 1256 self._hsp.sbjct += seq 1257 if self._hsp.sbjct_start is None: 1258 self._hsp.sbjct_start = _safe_int(start) 1259 1260 self._hsp.sbjct_end = _safe_int(end) 1261 if len(seq) != self._query_len: 1262 raise ValueError( 1263 "QUERY and SBJCT sequence lengths don't match (%i %r vs %i) in line\n%s" 1264 % (self._query_len, self._hsp.query, len(seq), line)) 1265 1266 del self._query_start_index # clean up unused variables 1267 del self._query_len
1268
1269 - def end_hsp(self):
1270 pass
1271 1272
1273 -class _DatabaseReportConsumer(object):
1274
1275 - def start_database_report(self):
1276 self._dr = Record.DatabaseReport()
1277
1278 - def database(self, line):
1279 m = re.search(r"Database: (.+)$", line) 1280 if m: 1281 self._dr.database_name.append(m.group(1)) 1282 elif self._dr.database_name: 1283 # This must be a continuation of the previous name. 1284 self._dr.database_name[-1] = "%s%s" % (self._dr.database_name[-1], 1285 line.strip())
1286
1287 - def posted_date(self, line):
1288 self._dr.posted_date.append(_re_search( 1289 r"Posted date:\s*(.+)$", line, 1290 "I could not find the posted date in line\n%s" % line))
1291
1292 - def num_letters_in_database(self, line):
1293 letters, = _get_cols( 1294 line, (-1,), ncols=6, expected={2: "letters", 4: "database:"}) 1295 self._dr.num_letters_in_database.append(_safe_int(letters))
1296
1297 - def num_sequences_in_database(self, line):
1298 sequences, = _get_cols( 1299 line, (-1,), ncols=6, expected={2: "sequences", 4: "database:"}) 1300 self._dr.num_sequences_in_database.append(_safe_int(sequences))
1301
1302 - def ka_params(self, line):
1303 self._dr.ka_params = [_safe_float(x) for x in line.split()]
1304
1305 - def gapped(self, line):
1306 self._dr.gapped = 1
1307
1308 - def ka_params_gap(self, line):
1309 self._dr.ka_params_gap = [_safe_float(x) for x in line.split()]
1310
1311 - def end_database_report(self):
1312 pass
1313 1314
1315 -class _ParametersConsumer(object):
1316 - def start_parameters(self):
1317 self._params = Record.Parameters()
1318
1319 - def matrix(self, line):
1320 self._params.matrix = line[8:].rstrip()
1321
1322 - def gap_penalties(self, line):
1323 self._params.gap_penalties = [_safe_float(x) for x in _get_cols( 1324 line, (3, 5), ncols=6, expected={2: "Existence:", 4: "Extension:"})]
1325
1326 - def num_hits(self, line):
1327 if '1st pass' in line: 1328 x, = _get_cols(line, (-4,), ncols=11, expected={2: "Hits"}) 1329 self._params.num_hits = _safe_int(x) 1330 else: 1331 x, = _get_cols(line, (-1,), ncols=6, expected={2: "Hits"}) 1332 self._params.num_hits = _safe_int(x)
1333
1334 - def num_sequences(self, line):
1335 if '1st pass' in line: 1336 x, = _get_cols(line, (-4,), ncols=9, expected={2: "Sequences:"}) 1337 self._params.num_sequences = _safe_int(x) 1338 else: 1339 x, = _get_cols(line, (-1,), ncols=4, expected={2: "Sequences:"}) 1340 self._params.num_sequences = _safe_int(x)
1341
1342 - def num_extends(self, line):
1343 if '1st pass' in line: 1344 x, = _get_cols(line, (-4,), ncols=9, expected={2: "extensions:"}) 1345 self._params.num_extends = _safe_int(x) 1346 else: 1347 x, = _get_cols(line, (-1,), ncols=4, expected={2: "extensions:"}) 1348 self._params.num_extends = _safe_int(x)
1349
1350 - def num_good_extends(self, line):
1351 if '1st pass' in line: 1352 x, = _get_cols(line, (-4,), ncols=10, expected={3: "extensions:"}) 1353 self._params.num_good_extends = _safe_int(x) 1354 else: 1355 x, = _get_cols(line, (-1,), ncols=5, expected={3: "extensions:"}) 1356 self._params.num_good_extends = _safe_int(x)
1357
1358 - def num_seqs_better_e(self, line):
1359 self._params.num_seqs_better_e, = _get_cols( 1360 line, (-1,), ncols=7, expected={2: "sequences"}) 1361 self._params.num_seqs_better_e = _safe_int( 1362 self._params.num_seqs_better_e)
1363
1364 - def hsps_no_gap(self, line):
1365 self._params.hsps_no_gap, = _get_cols( 1366 line, (-1,), ncols=9, expected={3: "better", 7: "gapping:"}) 1367 self._params.hsps_no_gap = _safe_int(self._params.hsps_no_gap)
1368
1369 - def hsps_prelim_gapped(self, line):
1370 self._params.hsps_prelim_gapped, = _get_cols( 1371 line, (-1,), ncols=9, expected={4: "gapped", 6: "prelim"}) 1372 self._params.hsps_prelim_gapped = _safe_int( 1373 self._params.hsps_prelim_gapped)
1374
1375 - def hsps_prelim_gapped_attempted(self, line):
1376 self._params.hsps_prelim_gapped_attempted, = _get_cols( 1377 line, (-1,), ncols=10, expected={4: "attempted", 7: "prelim"}) 1378 self._params.hsps_prelim_gapped_attempted = _safe_int( 1379 self._params.hsps_prelim_gapped_attempted)
1380
1381 - def hsps_gapped(self, line):
1382 self._params.hsps_gapped, = _get_cols( 1383 line, (-1,), ncols=6, expected={3: "gapped"}) 1384 self._params.hsps_gapped = _safe_int(self._params.hsps_gapped)
1385
1386 - def query_length(self, line):
1387 self._params.query_length, = _get_cols( 1388 line.lower(), (-1,), ncols=4, expected={0: "length", 2: "query:"}) 1389 self._params.query_length = _safe_int(self._params.query_length)
1390
1391 - def database_length(self, line):
1392 self._params.database_length, = _get_cols( 1393 line.lower(), (-1,), ncols=4, expected={0: "length", 2: "database:"}) 1394 self._params.database_length = _safe_int(self._params.database_length)
1395
1396 - def effective_hsp_length(self, line):
1397 self._params.effective_hsp_length, = _get_cols( 1398 line, (-1,), ncols=4, expected={1: "HSP", 2: "length:"}) 1399 self._params.effective_hsp_length = _safe_int( 1400 self._params.effective_hsp_length)
1401
1402 - def effective_query_length(self, line):
1403 self._params.effective_query_length, = _get_cols( 1404 line, (-1,), ncols=5, expected={1: "length", 3: "query:"}) 1405 self._params.effective_query_length = _safe_int( 1406 self._params.effective_query_length)
1407
1408 - def effective_database_length(self, line):
1409 self._params.effective_database_length, = _get_cols( 1410 line.lower(), (-1,), ncols=5, expected={1: "length", 3: "database:"}) 1411 self._params.effective_database_length = _safe_int( 1412 self._params.effective_database_length)
1413
1414 - def effective_search_space(self, line):
1415 self._params.effective_search_space, = _get_cols( 1416 line, (-1,), ncols=4, expected={1: "search"}) 1417 self._params.effective_search_space = _safe_int( 1418 self._params.effective_search_space)
1419
1420 - def effective_search_space_used(self, line):
1421 self._params.effective_search_space_used, = _get_cols( 1422 line, (-1,), ncols=5, expected={1: "search", 3: "used:"}) 1423 self._params.effective_search_space_used = _safe_int( 1424 self._params.effective_search_space_used)
1425
1426 - def frameshift(self, line):
1427 self._params.frameshift = _get_cols(line, (4, 5), ncols=6, 1428 expected={0: "frameshift", 2: "decay"})
1429
1430 - def threshold(self, line):
1431 if line[:2] == "T:": 1432 # Assume its an old style line like "T: 123" 1433 self._params.threshold, = _get_cols(line, (1,), ncols=2, 1434 expected={0: "T:"}) 1435 elif line[:28] == "Neighboring words threshold:": 1436 self._params.threshold, = _get_cols(line, (3,), ncols=4, 1437 expected={0: "Neighboring", 1438 1: "words", 1439 2: "threshold:"}) 1440 else: 1441 raise ValueError("Unrecognised threshold line:\n%s" % line) 1442 self._params.threshold = _safe_int(self._params.threshold)
1443
1444 - def window_size(self, line):
1445 if line[:2] == "A:": 1446 self._params.window_size, = _get_cols(line, (1,), ncols=2, 1447 expected={0: "A:"}) 1448 elif line[:25] == "Window for multiple hits:": 1449 self._params.window_size, = _get_cols(line, (4,), ncols=5, 1450 expected={0: "Window", 1451 2: "multiple", 1452 3: "hits:"}) 1453 else: 1454 raise ValueError("Unrecognised window size line:\n%s" % line) 1455 self._params.window_size = _safe_int(self._params.window_size)
1456
1457 - def dropoff_1st_pass(self, line):
1458 score, bits = _re_search( 1459 r"X1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1460 "I could not find the dropoff in line\n%s" % line) 1461 self._params.dropoff_1st_pass = _safe_int(score), _safe_float(bits)
1462
1463 - def gap_x_dropoff(self, line):
1464 score, bits = _re_search( 1465 r"X2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1466 "I could not find the gap dropoff in line\n%s" % line) 1467 self._params.gap_x_dropoff = _safe_int(score), _safe_float(bits)
1468
1469 - def gap_x_dropoff_final(self, line):
1470 score, bits = _re_search( 1471 r"X3: (\d+) \(\s*([0-9,.]+) bits\)", line, 1472 "I could not find the gap dropoff final in line\n%s" % line) 1473 self._params.gap_x_dropoff_final = _safe_int(score), _safe_float(bits)
1474
1475 - def gap_trigger(self, line):
1476 score, bits = _re_search( 1477 r"S1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1478 "I could not find the gap trigger in line\n%s" % line) 1479 self._params.gap_trigger = _safe_int(score), _safe_float(bits)
1480
1481 - def blast_cutoff(self, line):
1482 score, bits = _re_search( 1483 r"S2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1484 "I could not find the blast cutoff in line\n%s" % line) 1485 self._params.blast_cutoff = _safe_int(score), _safe_float(bits)
1486
1487 - def end_parameters(self):
1488 pass
1489 1490
1491 -class _BlastConsumer(AbstractConsumer, 1492 _HeaderConsumer, 1493 _DescriptionConsumer, 1494 _AlignmentConsumer, 1495 _HSPConsumer, 1496 _DatabaseReportConsumer, 1497 _ParametersConsumer 1498 ):
1499 # This Consumer is inherits from many other consumer classes that handle 1500 # the actual dirty work. An alternate way to do it is to create objects 1501 # of those classes and then delegate the parsing tasks to them in a 1502 # decorator-type pattern. The disadvantage of that is that the method 1503 # names will need to be resolved in this classes. However, using 1504 # a decorator will retain more control in this class (which may or 1505 # may not be a bad thing). In addition, having each sub-consumer as 1506 # its own object prevents this object's dictionary from being cluttered 1507 # with members and reduces the chance of member collisions.
1508 - def __init__(self):
1509 self.data = None
1510
1511 - def round(self, line):
1512 # Make sure nobody's trying to pass me PSI-BLAST data! 1513 raise ValueError("This consumer doesn't handle PSI-BLAST data")
1514
1515 - def start_header(self):
1516 self.data = Record.Blast() 1517 _HeaderConsumer.start_header(self)
1518
1519 - def end_header(self):
1520 _HeaderConsumer.end_header(self) 1521 self.data.__dict__.update(self._header.__dict__)
1522
1523 - def end_descriptions(self):
1524 self.data.descriptions = self._descriptions
1525
1526 - def end_alignment(self):
1527 _AlignmentConsumer.end_alignment(self) 1528 if self._alignment.hsps: 1529 self.data.alignments.append(self._alignment) 1530 if self._multiple_alignment.alignment: 1531 self.data.multiple_alignment = self._multiple_alignment
1532
1533 - def end_hsp(self):
1534 _HSPConsumer.end_hsp(self) 1535 try: 1536 self._alignment.hsps.append(self._hsp) 1537 except AttributeError: 1538 raise ValueError("Found an HSP before an alignment")
1539
1540 - def end_database_report(self):
1541 _DatabaseReportConsumer.end_database_report(self) 1542 self.data.__dict__.update(self._dr.__dict__)
1543
1544 - def end_parameters(self):
1545 _ParametersConsumer.end_parameters(self) 1546 self.data.__dict__.update(self._params.__dict__)
1547 1548
1549 -class _PSIBlastConsumer(AbstractConsumer, 1550 _HeaderConsumer, 1551 _DescriptionConsumer, 1552 _AlignmentConsumer, 1553 _HSPConsumer, 1554 _DatabaseReportConsumer, 1555 _ParametersConsumer 1556 ):
1557 - def __init__(self):
1558 self.data = None
1559
1560 - def start_header(self):
1561 self.data = Record.PSIBlast() 1562 _HeaderConsumer.start_header(self)
1563
1564 - def end_header(self):
1565 _HeaderConsumer.end_header(self) 1566 self.data.__dict__.update(self._header.__dict__)
1567
1568 - def start_descriptions(self):
1569 self._round = Record.Round() 1570 self.data.rounds.append(self._round) 1571 _DescriptionConsumer.start_descriptions(self)
1572
1573 - def end_descriptions(self):
1574 _DescriptionConsumer.end_descriptions(self) 1575 self._round.number = self._roundnum 1576 if self._descriptions: 1577 self._round.new_seqs.extend(self._descriptions) 1578 self._round.reused_seqs.extend(self._model_sequences) 1579 self._round.new_seqs.extend(self._nonmodel_sequences) 1580 if self._converged: 1581 self.data.converged = 1
1582
1583 - def end_alignment(self):
1584 _AlignmentConsumer.end_alignment(self) 1585 if self._alignment.hsps: 1586 self._round.alignments.append(self._alignment) 1587 if self._multiple_alignment: 1588 self._round.multiple_alignment = self._multiple_alignment
1589
1590 - def end_hsp(self):
1591 _HSPConsumer.end_hsp(self) 1592 try: 1593 self._alignment.hsps.append(self._hsp) 1594 except AttributeError: 1595 raise ValueError("Found an HSP before an alignment")
1596
1597 - def end_database_report(self):
1598 _DatabaseReportConsumer.end_database_report(self) 1599 self.data.__dict__.update(self._dr.__dict__)
1600
1601 - def end_parameters(self):
1602 _ParametersConsumer.end_parameters(self) 1603 self.data.__dict__.update(self._params.__dict__)
1604 1605
1606 -class Iterator(object):
1607 """Iterates over a file of multiple BLAST results. 1608 1609 Methods: 1610 next Return the next record from the stream, or None. 1611 1612 """ 1613
1614 - def __init__(self, handle, parser=None):
1615 """__init__(self, handle, parser=None) 1616 1617 Create a new iterator. handle is a file-like object. parser 1618 is an optional Parser object to change the results into another form. 1619 If set to None, then the raw contents of the file will be returned. 1620 1621 """ 1622 try: 1623 handle.readline 1624 except AttributeError: 1625 raise ValueError( 1626 "I expected a file handle or file-like object, got %s" 1627 % type(handle)) 1628 self._uhandle = File.UndoHandle(handle) 1629 self._parser = parser 1630 self._header = []
1631
1632 - def __next__(self):
1633 """next(self) -> object 1634 1635 Return the next Blast record from the file. If no more records, 1636 return None. 1637 """ 1638 lines = [] 1639 query = False 1640 while True: 1641 line = self._uhandle.readline() 1642 if not line: 1643 break 1644 # If I've reached the next one, then put the line back and stop. 1645 if lines and (line.startswith('BLAST') or 1646 line.startswith('BLAST', 1) or 1647 line.startswith('<?xml ')): 1648 self._uhandle.saveline(line) 1649 break 1650 # New style files omit the BLAST line to mark a new query: 1651 if line.startswith("Query="): 1652 if not query: 1653 if not self._header: 1654 self._header = lines[:] 1655 query = True 1656 else: 1657 # Start of another record 1658 self._uhandle.saveline(line) 1659 break 1660 lines.append(line) 1661 1662 if query and "BLAST" not in lines[0]: 1663 # Cheat and re-insert the header 1664 # print "-"*50 1665 # print "".join(self._header) 1666 # print "-"*50 1667 # print "".join(lines) 1668 # print "-"*50 1669 lines = self._header + lines 1670 1671 if not lines: 1672 return None 1673 1674 data = ''.join(lines) 1675 if self._parser is not None: 1676 return self._parser.parse(StringIO(data)) 1677 return data
1678 1679 if sys.version_info[0] < 3:
1680 - def next(self):
1681 """Python 2 style alias for Python 3 style __next__ method.""" 1682 return self.__next__()
1683
1684 - def __iter__(self):
1685 return iter(self.__next__, None)
1686 1687
1688 -def _re_search(regex, line, error_msg):
1689 m = re.search(regex, line) 1690 if not m: 1691 raise ValueError(error_msg) 1692 return m.groups()
1693 1694
1695 -def _get_cols(line, cols_to_get, ncols=None, expected=None):
1696 if expected is None: 1697 expected = {} 1698 cols = line.split() 1699 1700 # Check to make sure number of columns is correct 1701 if ncols is not None and len(cols) != ncols: 1702 raise ValueError("I expected %d columns (got %d) in line\n%s" 1703 % (ncols, len(cols), line)) 1704 1705 # Check to make sure columns contain the correct data 1706 for k in expected: 1707 if cols[k] != expected[k]: 1708 raise ValueError("I expected '%s' in column %d in line\n%s" 1709 % (expected[k], k, line)) 1710 1711 # Construct the answer tuple 1712 results = [] 1713 for c in cols_to_get: 1714 results.append(cols[c]) 1715 return tuple(results)
1716 1717
1718 -def _safe_int(str):
1719 try: 1720 return int(str) 1721 except ValueError: 1722 # Something went wrong. Try to clean up the string. 1723 # Remove all commas from the string 1724 str = str.replace(',', '') 1725 # try again after removing commas. 1726 # Note int() will return a long rather than overflow 1727 try: 1728 return int(str) 1729 except ValueError: 1730 pass 1731 # Call float to handle things like "54.3", note could lose precision, e.g. 1732 # >>> int("5399354557888517312") 1733 # 5399354557888517312 1734 # >>> int(float("5399354557888517312")) 1735 # 5399354557888517120 1736 return int(float(str))
1737 1738
1739 -def _safe_float(str):
1740 # Thomas Rosleff Soerensen (rosleff@mpiz-koeln.mpg.de) noted that 1741 # float('e-172') does not produce an error on his platform. Thus, 1742 # we need to check the string for this condition. 1743 1744 # Sometimes BLAST leaves of the '1' in front of an exponent. 1745 if str and str[0] in ['E', 'e']: 1746 str = '1' + str 1747 try: 1748 return float(str) 1749 except ValueError: 1750 # Remove all commas from the string 1751 str = str.replace(',', '') 1752 # try again. 1753 return float(str)
1754 1755
1756 -class _BlastErrorConsumer(_BlastConsumer):
1757 - def __init__(self):
1759
1760 - def noevent(self, line):
1761 if 'Query must be at least wordsize' in line: 1762 raise ShortQueryBlastError("Query must be at least wordsize") 1763 # Now pass the line back up to the superclass. 1764 method = getattr(_BlastConsumer, 'noevent', 1765 _BlastConsumer.__getattr__(self, 'noevent')) 1766 method(line)
1767 1768
1769 -class BlastErrorParser(AbstractParser):
1770 """Attempt to catch and diagnose BLAST errors while parsing. 1771 1772 This utilizes the BlastParser module but adds an additional layer 1773 of complexity on top of it by attempting to diagnose ValueErrors 1774 that may actually indicate problems during BLAST parsing. 1775 1776 Current BLAST problems this detects are: 1777 - LowQualityBlastError - When BLASTing really low quality sequences 1778 (ie. some GenBank entries which are just short stretches of a single 1779 nucleotide), BLAST will report an error with the sequence and be 1780 unable to search with this. This will lead to a badly formatted 1781 BLAST report that the parsers choke on. The parser will convert the 1782 ValueError to a LowQualityBlastError and attempt to provide useful 1783 information. 1784 """ 1785
1786 - def __init__(self, bad_report_handle=None):
1787 """Initialize a parser that tries to catch BlastErrors. 1788 1789 Arguments: 1790 - bad_report_handle - An optional argument specifying a handle 1791 where bad reports should be sent. This would allow you to save 1792 all of the bad reports to a file, for instance. If no handle 1793 is specified, the bad reports will not be saved. 1794 """ 1795 self._bad_report_handle = bad_report_handle 1796 1797 # self._b_parser = BlastParser() 1798 self._scanner = _Scanner() 1799 self._consumer = _BlastErrorConsumer()
1800
1801 - def parse(self, handle):
1802 """Parse a handle, attempting to diagnose errors.""" 1803