Package Bio :: Package Blast :: Module NCBIStandalone
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIStandalone

   1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # Patches by Mike Poidinger to support multiple databases. 
   6  # Updated by Peter Cock in 2007 to do a better job on BLAST 2.2.15 
   7   
   8  """Code for calling standalone BLAST and parsing plain text output (DEPRECATED). 
   9   
  10  Rather than parsing the human readable plain text BLAST output (which seems to 
  11  change with every update to BLAST), we and the NBCI recommend you parse the 
  12  XML output instead. The plain text parser in this module still works at the 
  13  time of writing, but is considered obsolete and updating it to cope with the 
  14  latest versions of BLAST is not a priority for us. 
  15   
  16  This module also provides code to work with the "legacy" standalone version of 
  17  NCBI BLAST, tools blastall, rpsblast and blastpgp via three helper functions of 
  18  the same name. These functions are very limited for dealing with the output as 
  19  files rather than handles, for which the wrappers in Bio.Blast.Applications are 
  20  preferred. Furthermore, the NCBI themselves regard these command line tools as 
  21  "legacy", and encourage using the new BLAST+ tools instead. Biopython has 
  22  wrappers for these under Bio.Blast.Applications (see the tutorial). 
  23  """ 
  24   
  25  from __future__ import print_function 
  26   
  27  from Bio import BiopythonDeprecationWarning 
  28  import warnings 
  29  warnings.warn("This module has been deprecated. Consider Bio.SearchIO for " 
  30                "parsing BLAST output instead.", BiopythonDeprecationWarning) 
  31   
  32  import os 
  33  import re 
  34  from Bio._py3k import StringIO 
  35   
  36  from Bio import File 
  37  from Bio.ParserSupport import * 
  38  from Bio.Blast import Record 
  39  from Bio.Application import _escape_filename 
  40   
  41  __docformat__ = "restructuredtext en" 
  42   
  43  _score_e_re = re.compile(r'Score +E') 
  44   
  45   
46 -class LowQualityBlastError(Exception):
47 """Error caused by running a low quality sequence through BLAST. 48 49 When low quality sequences (like GenBank entries containing only 50 stretches of a single nucleotide) are BLASTed, they will result in 51 BLAST generating an error and not being able to perform the BLAST. 52 search. This error should be raised for the BLAST reports produced 53 in this case. 54 """ 55 pass
56 57
58 -class ShortQueryBlastError(Exception):
59 """Error caused by running a short query sequence through BLAST. 60 61 If the query sequence is too short, BLAST outputs warnings and errors:: 62 63 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed. 64 [blastall] ERROR: [000.000] AT1G08320: Blast: 65 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize 66 done 67 68 This exception is raised when that condition is detected. 69 """ 70 pass
71 72
73 -class _Scanner(object):
74 """Scan BLAST output from blastall or blastpgp. 75 76 Tested with blastall and blastpgp v2.0.10, v2.0.11 77 78 Methods: 79 - feed Feed data into the scanner. 80 """
81 - def feed(self, handle, consumer):
82 """S.feed(handle, consumer) 83 84 Feed in a BLAST report for scanning. handle is a file-like 85 object that contains the BLAST report. consumer is a Consumer 86 object that will receive events as the report is scanned. 87 """ 88 if isinstance(handle, File.UndoHandle): 89 uhandle = handle 90 else: 91 uhandle = File.UndoHandle(handle) 92 93 # Try to fast-forward to the beginning of the blast report. 94 read_and_call_until(uhandle, consumer.noevent, contains='BLAST') 95 # Now scan the BLAST report. 96 self._scan_header(uhandle, consumer) 97 self._scan_rounds(uhandle, consumer) 98 self._scan_database_report(uhandle, consumer) 99 self._scan_parameters(uhandle, consumer)
100
101 - def _scan_header(self, uhandle, consumer):
102 # BLASTP 2.0.10 [Aug-26-1999] 103 # 104 # 105 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaf 106 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), 107 # "Gapped BLAST and PSI-BLAST: a new generation of protein database sea 108 # programs", Nucleic Acids Res. 25:3389-3402. 109 # 110 # Query= test 111 # (140 letters) 112 # 113 # Database: sdqib40-1.35.seg.fa 114 # 1323 sequences; 223,339 total letters 115 # 116 # ======================================================== 117 # This next example is from the online version of Blast, 118 # note there are TWO references, an RID line, and also 119 # the database is BEFORE the query line. 120 # Note there possibleuse of non-ASCII in the author names. 121 # ======================================================== 122 # 123 # BLASTP 2.2.15 [Oct-15-2006] 124 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Sch??ffer, 125 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman 126 # (1997), "Gapped BLAST and PSI-BLAST: a new generation of 127 # protein database search programs", Nucleic Acids Res. 25:3389-3402. 128 # 129 # Reference: Sch??ffer, Alejandro A., L. Aravind, Thomas L. Madden, Sergei 130 # Shavirin, John L. Spouge, Yuri I. Wolf, Eugene V. Koonin, and 131 # Stephen F. Altschul (2001), "Improving the accuracy of PSI-BLAST 132 # protein database searches with composition-based statistics 133 # and other refinements", Nucleic Acids Res. 29:2994-3005. 134 # 135 # RID: 1166022616-19998-65316425856.BLASTQ1 136 # 137 # 138 # Database: All non-redundant GenBank CDS 139 # translations+PDB+SwissProt+PIR+PRF excluding environmental samples 140 # 4,254,166 sequences; 1,462,033,012 total letters 141 # Query= gi:16127998 142 # Length=428 143 # 144 145 consumer.start_header() 146 147 read_and_call(uhandle, consumer.version, contains='BLAST') 148 read_and_call_while(uhandle, consumer.noevent, blank=1) 149 150 # There might be a <pre> line, for qblast output. 151 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>") 152 153 # Read the reference(s) 154 while attempt_read_and_call(uhandle, 155 consumer.reference, start='Reference'): 156 # References are normally multiline terminated by a blank line 157 # (or, based on the old code, the RID line) 158 while True: 159 line = uhandle.readline() 160 if is_blank_line(line): 161 consumer.noevent(line) 162 break 163 elif line.startswith("RID"): 164 break 165 else: 166 # More of the reference 167 consumer.reference(line) 168 169 # Deal with the optional RID: ... 170 read_and_call_while(uhandle, consumer.noevent, blank=1) 171 attempt_read_and_call(uhandle, consumer.reference, start="RID:") 172 read_and_call_while(uhandle, consumer.noevent, blank=1) 173 174 # blastpgp may have a reference for compositional score matrix 175 # adjustment (see Bug 2502): 176 if attempt_read_and_call( 177 uhandle, consumer.reference, start="Reference"): 178 read_and_call_until(uhandle, consumer.reference, blank=1) 179 read_and_call_while(uhandle, consumer.noevent, blank=1) 180 181 # blastpgp has a Reference for composition-based statistics. 182 if attempt_read_and_call( 183 uhandle, consumer.reference, start="Reference"): 184 read_and_call_until(uhandle, consumer.reference, blank=1) 185 read_and_call_while(uhandle, consumer.noevent, blank=1) 186 187 line = uhandle.peekline() 188 assert line.strip() != "" 189 assert not line.startswith("RID:") 190 if line.startswith("Query="): 191 # This is an old style query then database... 192 193 # Read the Query lines and the following blank line. 194 read_and_call(uhandle, consumer.query_info, start='Query=') 195 read_and_call_until(uhandle, consumer.query_info, blank=1) 196 read_and_call_while(uhandle, consumer.noevent, blank=1) 197 198 # Read the database lines and the following blank line. 199 read_and_call_until(uhandle, consumer.database_info, end='total letters') 200 read_and_call(uhandle, consumer.database_info, contains='sequences') 201 read_and_call_while(uhandle, consumer.noevent, blank=1) 202 elif line.startswith("Database:"): 203 # This is a new style database then query... 204 read_and_call_until(uhandle, consumer.database_info, end='total letters') 205 read_and_call(uhandle, consumer.database_info, contains='sequences') 206 read_and_call_while(uhandle, consumer.noevent, blank=1) 207 208 # Read the Query lines and the following blank line. 209 # Or, on BLAST 2.2.22+ there is no blank link - need to spot 210 # the "... Score E" line instead. 211 read_and_call(uhandle, consumer.query_info, start='Query=') 212 # BLAST 2.2.25+ has a blank line before Length= 213 read_and_call_until(uhandle, consumer.query_info, start='Length=') 214 while True: 215 line = uhandle.peekline() 216 if not line.strip() or _score_e_re.search(line) is not None: 217 break 218 # It is more of the query (and its length) 219 read_and_call(uhandle, consumer.query_info) 220 read_and_call_while(uhandle, consumer.noevent, blank=1) 221 else: 222 raise ValueError("Invalid header?") 223 224 consumer.end_header()
225
226 - def _scan_rounds(self, uhandle, consumer):
227 # Scan a bunch of rounds. 228 # Each round begins with either a "Searching......" line 229 # or a 'Score E' line followed by descriptions and alignments. 230 # The email server doesn't give the "Searching....." line. 231 # If there is no 'Searching.....' line then you'll first see a 232 # 'Results from round' line 233 234 while not self._eof(uhandle): 235 line = safe_peekline(uhandle) 236 if not line.startswith('Searching') and \ 237 not line.startswith('Results from round') and \ 238 _score_e_re.search(line) is None and \ 239 'No hits found' not in line: 240 break 241 self._scan_descriptions(uhandle, consumer) 242 self._scan_alignments(uhandle, consumer)
243
244 - def _scan_descriptions(self, uhandle, consumer):
245 # Searching..................................................done 246 # Results from round 2 247 # 248 # 249 # Sc 250 # Sequences producing significant alignments: (b 251 # Sequences used in model and found again: 252 # 253 # d1tde_2 3.4.1.4.4 (119-244) Thioredoxin reductase [Escherichia ... 254 # d1tcob_ 1.31.1.5.16 Calcineurin regulatory subunit (B-chain) [B... 255 # d1symb_ 1.31.1.2.2 Calcyclin (S100) [RAT (RATTUS NORVEGICUS)] 256 # 257 # Sequences not found previously or not previously below threshold: 258 # 259 # d1osa__ 1.31.1.5.11 Calmodulin [Paramecium tetraurelia] 260 # d1aoza3 2.5.1.3.3 (339-552) Ascorbate oxidase [zucchini (Cucurb... 261 # 262 263 # If PSI-BLAST, may also have: 264 # 265 # CONVERGED! 266 267 consumer.start_descriptions() 268 269 # Read 'Searching' 270 # This line seems to be missing in BLASTN 2.1.2 (others?) 271 attempt_read_and_call(uhandle, consumer.noevent, start='Searching') 272 273 # blastpgp 2.0.10 from NCBI 9/19/99 for Solaris sometimes crashes here. 274 # If this happens, the handle will yield no more information. 275 if not uhandle.peekline(): 276 raise ValueError("Unexpected end of blast report. " + 277 "Looks suspiciously like a PSI-BLAST crash.") 278 279 # BLASTN 2.2.3 sometimes spews a bunch of warnings and errors here: 280 # Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch 281 # [blastall] ERROR: [000.000] AT1G08320: Blast: 282 # [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at leas 283 # done 284 # Reported by David Weisman. 285 # Check for these error lines and ignore them for now. Let 286 # the BlastErrorParser deal with them. 287 line = uhandle.peekline() 288 if "ERROR:" in line or line.startswith("done"): 289 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:") 290 read_and_call(uhandle, consumer.noevent, start="done") 291 292 # Check to see if this is PSI-BLAST. 293 # If it is, the 'Searching' line will be followed by: 294 # (version 2.0.10) 295 # Searching............................. 296 # Results from round 2 297 # or (version 2.0.11) 298 # Searching............................. 299 # 300 # 301 # Results from round 2 302 303 # Skip a bunch of blank lines. 304 read_and_call_while(uhandle, consumer.noevent, blank=1) 305 # Check for the results line if it's there. 306 if attempt_read_and_call(uhandle, consumer.round, start='Results'): 307 read_and_call_while(uhandle, consumer.noevent, blank=1) 308 309 # Three things can happen here: 310 # 1. line contains 'Score E' 311 # 2. line contains "No hits found" 312 # 3. no descriptions 313 # The first one begins a bunch of descriptions. The last two 314 # indicates that no descriptions follow, and we should go straight 315 # to the alignments. 316 if not attempt_read_and_call( 317 uhandle, consumer.description_header, 318 has_re=_score_e_re): 319 # Either case 2 or 3. Look for "No hits found". 320 attempt_read_and_call(uhandle, consumer.no_hits, 321 contains='No hits found') 322 try: 323 read_and_call_while(uhandle, consumer.noevent, blank=1) 324 except ValueError as err: 325 if str(err) != "Unexpected end of stream.": 326 raise err 327 328 consumer.end_descriptions() 329 # Stop processing. 330 return 331 332 # Read the score header lines 333 read_and_call(uhandle, consumer.description_header, 334 start='Sequences producing') 335 336 # If PSI-BLAST, read the 'Sequences used in model' line. 337 attempt_read_and_call(uhandle, consumer.model_sequences, 338 start='Sequences used in model') 339 read_and_call_while(uhandle, consumer.noevent, blank=1) 340 341 # In BLAT, rather than a "No hits found" line, we just 342 # get no descriptions (and no alignments). This can be 343 # spotted because the next line is the database block: 344 if safe_peekline(uhandle).startswith(" Database:"): 345 consumer.end_descriptions() 346 # Stop processing. 347 return 348 349 # Read the descriptions and the following blank lines, making 350 # sure that there are descriptions. 351 if not uhandle.peekline().startswith('Sequences not found'): 352 read_and_call_until(uhandle, consumer.description, blank=1) 353 read_and_call_while(uhandle, consumer.noevent, blank=1) 354 355 # If PSI-BLAST, read the 'Sequences not found' line followed 356 # by more descriptions. However, I need to watch out for the 357 # case where there were no sequences not found previously, in 358 # which case there will be no more descriptions. 359 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences, 360 start='Sequences not found'): 361 # Read the descriptions and the following blank lines. 362 read_and_call_while(uhandle, consumer.noevent, blank=1) 363 l = safe_peekline(uhandle) 364 # Brad -- added check for QUERY. On some PSI-BLAST outputs 365 # there will be a 'Sequences not found' line followed by no 366 # descriptions. Check for this case since the first thing you'll 367 # get is a blank line and then 'QUERY' 368 if not l.startswith('CONVERGED') and l[0] != '>' \ 369 and not l.startswith('QUERY'): 370 read_and_call_until(uhandle, consumer.description, blank=1) 371 read_and_call_while(uhandle, consumer.noevent, blank=1) 372 373 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED') 374 read_and_call_while(uhandle, consumer.noevent, blank=1) 375 376 consumer.end_descriptions()
377
378 - def _scan_alignments(self, uhandle, consumer):
379 if self._eof(uhandle): 380 return 381 382 # qblast inserts a helpful line here. 383 attempt_read_and_call(uhandle, consumer.noevent, start="ALIGNMENTS") 384 385 # First, check to see if I'm at the database report. 386 line = safe_peekline(uhandle) 387 if not line: 388 # EOF 389 return 390 elif line.startswith(' Database') or line.startswith("Lambda"): 391 return 392 elif line[0] == '>': 393 # XXX make a better check here between pairwise and masterslave 394 self._scan_pairwise_alignments(uhandle, consumer) 395 elif line.startswith('Effective'): 396 return 397 else: 398 # XXX put in a check to make sure I'm in a masterslave alignment 399 self._scan_masterslave_alignment(uhandle, consumer)
400
401 - def _scan_pairwise_alignments(self, uhandle, consumer):
402 while not self._eof(uhandle): 403 line = safe_peekline(uhandle) 404 if line[0] != '>': 405 break 406 self._scan_one_pairwise_alignment(uhandle, consumer)
407
408 - def _scan_one_pairwise_alignment(self, uhandle, consumer):
409 if self._eof(uhandle): 410 return 411 consumer.start_alignment() 412 413 self._scan_alignment_header(uhandle, consumer) 414 415 # Scan a bunch of score/alignment pairs. 416 while True: 417 if self._eof(uhandle): 418 # Shouldn't have issued that _scan_alignment_header event... 419 break 420 line = safe_peekline(uhandle) 421 if not line.startswith(' Score'): 422 break 423 self._scan_hsp(uhandle, consumer) 424 consumer.end_alignment()
425
426 - def _scan_alignment_header(self, uhandle, consumer):
427 # >d1rip__ 2.24.7.1.1 Ribosomal S17 protein [Bacillus 428 # stearothermophilus] 429 # Length = 81 430 # 431 # Or, more recently with different white space: 432 # 433 # >gi|15799684|ref|NP_285696.1| threonine synthase ... 434 # gi|15829258|ref|NP_308031.1| threonine synthase 435 # ... 436 # Length=428 437 read_and_call(uhandle, consumer.title, start='>') 438 while True: 439 line = safe_readline(uhandle) 440 if line.lstrip().startswith('Length =') \ 441 or line.lstrip().startswith('Length='): 442 consumer.length(line) 443 break 444 elif is_blank_line(line): 445 # Check to make sure I haven't missed the Length line 446 raise ValueError("I missed the Length in an alignment header") 447 consumer.title(line) 448 449 # Older versions of BLAST will have a line with some spaces. 450 # Version 2.0.14 (maybe 2.0.13?) and above print a true blank line. 451 if not attempt_read_and_call(uhandle, consumer.noevent, 452 start=' '): 453 read_and_call(uhandle, consumer.noevent, blank=1)
454
455 - def _scan_hsp(self, uhandle, consumer):
456 consumer.start_hsp() 457 self._scan_hsp_header(uhandle, consumer) 458 self._scan_hsp_alignment(uhandle, consumer) 459 consumer.end_hsp()
460
461 - def _scan_hsp_header(self, uhandle, consumer):
462 # Score = 22.7 bits (47), Expect = 2.5 463 # Identities = 10/36 (27%), Positives = 18/36 (49%) 464 # Strand = Plus / Plus 465 # Frame = +3 466 # 467 468 read_and_call(uhandle, consumer.score, start=' Score') 469 read_and_call(uhandle, consumer.identities, start=' Identities') 470 # BLASTN 471 attempt_read_and_call(uhandle, consumer.strand, start=' Strand') 472 # BLASTX, TBLASTN, TBLASTX 473 attempt_read_and_call(uhandle, consumer.frame, start=' Frame') 474 read_and_call(uhandle, consumer.noevent, blank=1)
475
476 - def _scan_hsp_alignment(self, uhandle, consumer):
477 # Query: 11 GRGVSACA-------TCDGFFYRNQKVAVIGGGNTAVEEALYLSNIASEVHLIHRRDGF 478 # GRGVS+ TC Y + + V GGG+ + EE L + I R+ 479 # Sbjct: 12 GRGVSSVVRRCIHKPTCKE--YAVKIIDVTGGGSFSAEEVQELREATLKEVDILRKVSG 480 # 481 # Query: 64 AEKILIKR 71 482 # I +K 483 # Sbjct: 70 PNIIQLKD 77 484 # 485 486 while True: 487 # Blastn adds an extra line filled with spaces before Query 488 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 489 read_and_call(uhandle, consumer.query, start='Query') 490 read_and_call(uhandle, consumer.align, start=' ') 491 read_and_call(uhandle, consumer.sbjct, start='Sbjct') 492 try: 493 read_and_call_while(uhandle, consumer.noevent, blank=1) 494 except ValueError as err: 495 if str(err) != "Unexpected end of stream.": 496 raise err 497 # End of File (well, it looks like it with recent versions 498 # of BLAST for multiple queries after the Iterator class 499 # has broken up the whole file into chunks). 500 break 501 line = safe_peekline(uhandle) 502 # Alignment continues if I see a 'Query' or the spaces for Blastn. 503 if not (line.startswith('Query') or line.startswith(' ')): 504 break
505
506 - def _scan_masterslave_alignment(self, uhandle, consumer):
507 consumer.start_alignment() 508 while True: 509 line = safe_readline(uhandle) 510 # Check to see whether I'm finished reading the alignment. 511 # This is indicated by 1) database section, 2) next psi-blast 512 # round, which can also be a 'Results from round' if no 513 # searching line is present 514 # patch by chapmanb 515 if line.startswith('Searching') or \ 516 line.startswith('Results from round'): 517 uhandle.saveline(line) 518 break 519 elif line.startswith(' Database'): 520 uhandle.saveline(line) 521 break 522 elif is_blank_line(line): 523 consumer.noevent(line) 524 else: 525 consumer.multalign(line) 526 read_and_call_while(uhandle, consumer.noevent, blank=1) 527 consumer.end_alignment()
528
529 - def _eof(self, uhandle):
530 try: 531 line = safe_peekline(uhandle) 532 except ValueError as err: 533 if str(err) != "Unexpected end of stream.": 534 raise err 535 line = "" 536 return not line
537
538 - def _scan_database_report(self, uhandle, consumer):
539 # Database: sdqib40-1.35.seg.fa 540 # Posted date: Nov 1, 1999 4:25 PM 541 # Number of letters in database: 223,339 542 # Number of sequences in database: 1323 543 # 544 # Lambda K H 545 # 0.322 0.133 0.369 546 # 547 # Gapped 548 # Lambda K H 549 # 0.270 0.0470 0.230 550 # 551 ########################################## 552 # Or, more recently Blast 2.2.15 gives less blank lines 553 ########################################## 554 # Database: All non-redundant GenBank CDS translations+PDB+SwissProt+PIR+PRF excluding 555 # environmental samples 556 # Posted date: Dec 12, 2006 5:51 PM 557 # Number of letters in database: 667,088,753 558 # Number of sequences in database: 2,094,974 559 # Lambda K H 560 # 0.319 0.136 0.395 561 # Gapped 562 # Lambda K H 563 # 0.267 0.0410 0.140 564 565 if self._eof(uhandle): 566 return 567 568 consumer.start_database_report() 569 570 # Subset of the database(s) listed below 571 # Number of letters searched: 562,618,960 572 # Number of sequences searched: 228,924 573 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"): 574 read_and_call(uhandle, consumer.noevent, contains="letters") 575 read_and_call(uhandle, consumer.noevent, contains="sequences") 576 read_and_call(uhandle, consumer.noevent, start=" ") 577 578 # Sameet Mehta reported seeing output from BLASTN 2.2.9 that 579 # was missing the "Database" stanza completely. 580 while attempt_read_and_call(uhandle, consumer.database, 581 start=' Database'): 582 # BLAT output ends abruptly here, without any of the other 583 # information. Check to see if this is the case. If so, 584 # then end the database report here gracefully. 585 if not uhandle.peekline().strip() \ 586 or uhandle.peekline().startswith("BLAST"): 587 consumer.end_database_report() 588 return 589 590 # Database can span multiple lines. 591 read_and_call_until(uhandle, consumer.database, start=' Posted') 592 read_and_call(uhandle, consumer.posted_date, start=' Posted') 593 read_and_call(uhandle, consumer.num_letters_in_database, 594 start=' Number of letters') 595 read_and_call(uhandle, consumer.num_sequences_in_database, 596 start=' Number of sequences') 597 # There may not be a line starting with spaces... 598 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 599 600 line = safe_readline(uhandle) 601 uhandle.saveline(line) 602 if 'Lambda' in line: 603 break 604 605 try: 606 read_and_call(uhandle, consumer.noevent, start='Lambda') 607 read_and_call(uhandle, consumer.ka_params) 608 except: 609 pass 610 611 # This blank line is optional: 612 attempt_read_and_call(uhandle, consumer.noevent, blank=1) 613 614 # not BLASTP 615 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped') 616 # not TBLASTX 617 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'): 618 read_and_call(uhandle, consumer.ka_params_gap) 619 620 # Blast 2.2.4 can sometimes skip the whole parameter section. 621 # Thus, I need to be careful not to read past the end of the 622 # file. 623 try: 624 read_and_call_while(uhandle, consumer.noevent, blank=1) 625 except ValueError as x: 626 if str(x) != "Unexpected end of stream.": 627 raise 628 consumer.end_database_report()
629
630 - def _scan_parameters(self, uhandle, consumer):
631 # Matrix: BLOSUM62 632 # Gap Penalties: Existence: 11, Extension: 1 633 # Number of Hits to DB: 50604 634 # Number of Sequences: 1323 635 # Number of extensions: 1526 636 # Number of successful extensions: 6 637 # Number of sequences better than 10.0: 5 638 # Number of HSP's better than 10.0 without gapping: 5 639 # Number of HSP's successfully gapped in prelim test: 0 640 # Number of HSP's that attempted gapping in prelim test: 1 641 # Number of HSP's gapped (non-prelim): 5 642 # length of query: 140 643 # length of database: 223,339 644 # effective HSP length: 39 645 # effective length of query: 101 646 # effective length of database: 171,742 647 # effective search space: 17345942 648 # effective search space used: 17345942 649 # T: 11 650 # A: 40 651 # X1: 16 ( 7.4 bits) 652 # X2: 38 (14.8 bits) 653 # X3: 64 (24.9 bits) 654 # S1: 41 (21.9 bits) 655 # S2: 42 (20.8 bits) 656 ########################################## 657 # Or, more recently Blast(x) 2.2.15 gives 658 ########################################## 659 # Matrix: BLOSUM62 660 # Gap Penalties: Existence: 11, Extension: 1 661 # Number of Sequences: 4535438 662 # Number of Hits to DB: 2,588,844,100 663 # Number of extensions: 60427286 664 # Number of successful extensions: 126433 665 # Number of sequences better than 2.0: 30 666 # Number of HSP's gapped: 126387 667 # Number of HSP's successfully gapped: 35 668 # Length of query: 291 669 # Length of database: 1,573,298,872 670 # Length adjustment: 130 671 # Effective length of query: 161 672 # Effective length of database: 983,691,932 673 # Effective search space: 158374401052 674 # Effective search space used: 158374401052 675 # Neighboring words threshold: 12 676 # Window for multiple hits: 40 677 # X1: 16 ( 7.3 bits) 678 # X2: 38 (14.6 bits) 679 # X3: 64 (24.7 bits) 680 # S1: 41 (21.7 bits) 681 # S2: 32 (16.9 bits) 682 683 # Blast 2.2.4 can sometimes skip the whole parameter section. 684 # BLAT also skips the whole parameter section. 685 # Thus, check to make sure that the parameter section really 686 # exists. 687 if not uhandle.peekline().strip(): 688 return 689 690 # BLASTN 2.2.9 looks like it reverses the "Number of Hits" and 691 # "Number of Sequences" lines. 692 consumer.start_parameters() 693 694 # Matrix line may be missing in BLASTN 2.2.9 695 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix') 696 # not TBLASTX 697 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap') 698 699 attempt_read_and_call(uhandle, consumer.num_sequences, 700 start='Number of Sequences') 701 attempt_read_and_call(uhandle, consumer.num_hits, 702 start='Number of Hits') 703 attempt_read_and_call(uhandle, consumer.num_sequences, 704 start='Number of Sequences') 705 attempt_read_and_call(uhandle, consumer.num_extends, 706 start='Number of extensions') 707 attempt_read_and_call(uhandle, consumer.num_good_extends, 708 start='Number of successful') 709 710 attempt_read_and_call(uhandle, consumer.num_seqs_better_e, 711 start='Number of sequences') 712 713 # not BLASTN, TBLASTX 714 if attempt_read_and_call(uhandle, consumer.hsps_no_gap, 715 start="Number of HSP's better"): 716 # BLASTN 2.2.9 717 if attempt_read_and_call(uhandle, consumer.noevent, 718 start="Number of HSP's gapped:"): 719 read_and_call(uhandle, consumer.noevent, 720 start="Number of HSP's successfully") 721 # This is omitted in 2.2.15 722 attempt_read_and_call(uhandle, consumer.noevent, 723 start="Number of extra gapped extensions") 724 else: 725 read_and_call(uhandle, consumer.hsps_prelim_gapped, 726 start="Number of HSP's successfully") 727 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted, 728 start="Number of HSP's that") 729 read_and_call(uhandle, consumer.hsps_gapped, 730 start="Number of HSP's gapped") 731 # e.g. BLASTX 2.2.15 where the "better" line is missing 732 elif attempt_read_and_call(uhandle, consumer.noevent, 733 start="Number of HSP's gapped"): 734 read_and_call(uhandle, consumer.noevent, 735 start="Number of HSP's successfully") 736 737 # not in blastx 2.2.1 738 attempt_read_and_call(uhandle, consumer.query_length, 739 has_re=re.compile(r"[Ll]ength of query")) 740 # Not in BLASTX 2.2.22+ 741 attempt_read_and_call(uhandle, consumer.database_length, 742 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase")) 743 744 # BLASTN 2.2.9 745 attempt_read_and_call(uhandle, consumer.noevent, 746 start="Length adjustment") 747 attempt_read_and_call(uhandle, consumer.effective_hsp_length, 748 start='effective HSP') 749 # Not in blastx 2.2.1 750 attempt_read_and_call( 751 uhandle, consumer.effective_query_length, 752 has_re=re.compile(r'[Ee]ffective length of query')) 753 754 # This is not in BLASTP 2.2.15 755 attempt_read_and_call( 756 uhandle, consumer.effective_database_length, 757 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase')) 758 # Not in blastx 2.2.1, added a ':' to distinguish between 759 # this and the 'effective search space used' line 760 attempt_read_and_call( 761 uhandle, consumer.effective_search_space, 762 has_re=re.compile(r'[Ee]ffective search space:')) 763 # Does not appear in BLASTP 2.0.5 764 attempt_read_and_call( 765 uhandle, consumer.effective_search_space_used, 766 has_re=re.compile(r'[Ee]ffective search space used')) 767 768 # BLASTX, TBLASTN, TBLASTX 769 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift') 770 771 # not in BLASTN 2.2.9 772 attempt_read_and_call(uhandle, consumer.threshold, start='T') 773 # In BLASTX 2.2.15 replaced by: "Neighboring words threshold: 12" 774 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring words threshold') 775 776 # not in BLASTX 2.2.15 777 attempt_read_and_call(uhandle, consumer.window_size, start='A') 778 # get this instead: "Window for multiple hits: 40" 779 attempt_read_and_call(uhandle, consumer.window_size, start='Window for multiple hits') 780 781 # not in BLASTX 2.2.22+ 782 attempt_read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1') 783 # not TBLASTN 784 attempt_read_and_call(uhandle, consumer.gap_x_dropoff, start='X2') 785 786 # not BLASTN, TBLASTX 787 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final, 788 start='X3') 789 790 # not TBLASTN 791 attempt_read_and_call(uhandle, consumer.gap_trigger, start='S1') 792 # not in blastx 2.2.1 793 # first we make sure we have additional lines to work with, if 794 # not then the file is done and we don't have a final S2 795 if not is_blank_line(uhandle.peekline(), allow_spaces=1): 796 read_and_call(uhandle, consumer.blast_cutoff, start='S2') 797 798 consumer.end_parameters()
799 800
801 -class BlastParser(AbstractParser):
802 """Parses BLAST data into a Record.Blast object. 803 804 """
805 - def __init__(self):
806 """__init__(self)""" 807 self._scanner = _Scanner() 808 self._consumer = _BlastConsumer()
809
810 - def parse(self, handle):
811 """parse(self, handle)""" 812 self._scanner.feed(handle, self._consumer) 813 return self._consumer.data
814 815
816 -class PSIBlastParser(AbstractParser):
817 """Parses BLAST data into a Record.PSIBlast object. 818 819 """
820 - def __init__(self):
821 """__init__(self)""" 822 self._scanner = _Scanner() 823 self._consumer = _PSIBlastConsumer()
824
825 - def parse(self, handle):
826 """parse(self, handle)""" 827 self._scanner.feed(handle, self._consumer) 828 return self._consumer.data
829 830
831 -class _HeaderConsumer(object):
832 - def start_header(self):
833 self._header = Record.Header()
834
835 - def version(self, line):
836 c = line.split() 837 self._header.application = c[0] 838 self._header.version = c[1] 839 if len(c) > 2: 840 # The date is missing in the new C++ output from blastx 2.2.22+ 841 # Just get "BLASTX 2.2.22+\n" and that's all. 842 self._header.date = c[2][1:-1]
843
844 - def reference(self, line):
845 if line.startswith('Reference: '): 846 self._header.reference = line[11:] 847 else: 848 self._header.reference = self._header.reference + line
849
850 - def query_info(self, line):
851 if line.startswith('Query= '): 852 self._header.query = line[7:].lstrip() 853 elif line.startswith('Length='): 854 # New style way to give the query length in BLAST 2.2.22+ (the C++ code) 855 self._header.query_letters = _safe_int(line[7:].strip()) 856 elif not line.startswith(' '): # continuation of query_info 857 self._header.query = "%s%s" % (self._header.query, line) 858 else: 859 # Hope it is the old style way to give the query length: 860 letters, = _re_search( 861 r"([0-9,]+) letters", line, 862 "I could not find the number of letters in line\n%s" % line) 863 self._header.query_letters = _safe_int(letters)
864
865 - def database_info(self, line):
866 line = line.rstrip() 867 if line.startswith('Database: '): 868 self._header.database = line[10:] 869 elif not line.endswith('total letters'): 870 if self._header.database: 871 # Need to include a space when merging multi line datase descr 872 self._header.database = self._header.database + " " + line.strip() 873 else: 874 self._header.database = line.strip() 875 else: 876 sequences, letters = _re_search( 877 r"([0-9,]+) sequences; ([0-9,-]+) total letters", line, 878 "I could not find the sequences and letters in line\n%s" % line) 879 self._header.database_sequences = _safe_int(sequences) 880 self._header.database_letters = _safe_int(letters)
881
882 - def end_header(self):
883 # Get rid of the trailing newlines 884 self._header.reference = self._header.reference.rstrip() 885 self._header.query = self._header.query.rstrip()
886 887
888 -class _DescriptionConsumer(object):
889 - def start_descriptions(self):
890 self._descriptions = [] 891 self._model_sequences = [] 892 self._nonmodel_sequences = [] 893 self._converged = 0 894 self._type = None 895 self._roundnum = None 896 897 self.__has_n = 0 # Does the description line contain an N value?
898
899 - def description_header(self, line):
900 if line.startswith('Sequences producing'): 901 cols = line.split() 902 if cols[-1] == 'N': 903 self.__has_n = 1
904
905 - def description(self, line):
906 dh = self._parse(line) 907 if self._type == 'model': 908 self._model_sequences.append(dh) 909 elif self._type == 'nonmodel': 910 self._nonmodel_sequences.append(dh) 911 else: 912 self._descriptions.append(dh)
913
914 - def model_sequences(self, line):
915 self._type = 'model'
916
917 - def nonmodel_sequences(self, line):
918 self._type = 'nonmodel'
919
920 - def converged(self, line):
921 self._converged = 1
922
923 - def no_hits(self, line):
924 pass
925
926 - def round(self, line):
927 if not line.startswith('Results from round'): 928 raise ValueError("I didn't understand the round line\n%s" % line) 929 self._roundnum = _safe_int(line[18:].strip())
930
931 - def end_descriptions(self):
932 pass
933
934 - def _parse(self, description_line):
935 line = description_line # for convenience 936 dh = Record.Description() 937 938 # I need to separate the score and p-value from the title. 939 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 940 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 1 941 # special cases to handle: 942 # - title must be preserved exactly (including whitespaces) 943 # - score could be equal to e-value (not likely, but what if??) 944 # - sometimes there's an "N" score of '1'. 945 cols = line.split() 946 if len(cols) < 3: 947 raise ValueError( 948 "Line does not appear to contain description:\n%s" % line) 949 if self.__has_n: 950 i = line.rfind(cols[-1]) # find start of N 951 i = line.rfind(cols[-2], 0, i) # find start of p-value 952 i = line.rfind(cols[-3], 0, i) # find start of score 953 else: 954 i = line.rfind(cols[-1]) # find start of p-value 955 i = line.rfind(cols[-2], 0, i) # find start of score 956 if self.__has_n: 957 dh.title, dh.score, dh.e, dh.num_alignments = \ 958 line[:i].rstrip(), cols[-3], cols[-2], cols[-1] 959 else: 960 dh.title, dh.score, dh.e, dh.num_alignments = \ 961 line[:i].rstrip(), cols[-2], cols[-1], 1 962 dh.num_alignments = _safe_int(dh.num_alignments) 963 dh.score = _safe_int(dh.score) 964 dh.e = _safe_float(dh.e) 965 return dh
966 967
968 -class _AlignmentConsumer(object):
969 # This is a little bit tricky. An alignment can either be a 970 # pairwise alignment or a multiple alignment. Since it's difficult 971 # to know a-priori which one the blast record will contain, I'm going 972 # to make one class that can parse both of them.
973 - def start_alignment(self):
974 self._alignment = Record.Alignment() 975 self._multiple_alignment = Record.MultipleAlignment()
976
977 - def title(self, line):
978 if self._alignment.title: 979 self._alignment.title += " " 980 self._alignment.title += line.strip()
981
982 - def length(self, line):
983 # e.g. "Length = 81" or more recently, "Length=428" 984 parts = line.replace(" ", "").split("=") 985 assert len(parts) == 2, "Unrecognised format length line" 986 self._alignment.length = parts[1] 987 self._alignment.length = _safe_int(self._alignment.length)
988
989 - def multalign(self, line):
990 # Standalone version uses 'QUERY', while WWW version uses blast_tmp. 991 if line.startswith('QUERY') or line.startswith('blast_tmp'): 992 # If this is the first line of the multiple alignment, 993 # then I need to figure out how the line is formatted. 994 995 # Format of line is: 996 # QUERY 1 acttg...gccagaggtggtttattcagtctccataagagaggggacaaacg 60 997 try: 998 name, start, seq, end = line.split() 999 except ValueError: 1000 raise ValueError("I do not understand the line\n%s" % line) 1001 self._start_index = line.index(start, len(name)) 1002 self._seq_index = line.index(seq, 1003 self._start_index + len(start)) 1004 # subtract 1 for the space 1005 self._name_length = self._start_index - 1 1006 self._start_length = self._seq_index - self._start_index - 1 1007 self._seq_length = line.rfind(end) - self._seq_index - 1 1008 1009 # self._seq_index = line.index(seq) 1010 # # subtract 1 for the space 1011 # self._seq_length = line.rfind(end) - self._seq_index - 1 1012 # self._start_index = line.index(start) 1013 # self._start_length = self._seq_index - self._start_index - 1 1014 # self._name_length = self._start_index 1015 1016 # Extract the information from the line 1017 name = line[:self._name_length] 1018 name = name.rstrip() 1019 start = line[self._start_index:self._start_index + self._start_length] 1020 start = start.rstrip() 1021 if start: 1022 start = _safe_int(start) 1023 end = line[self._seq_index + self._seq_length:].rstrip() 1024 if end: 1025 end = _safe_int(end) 1026 seq = line[self._seq_index:self._seq_index + self._seq_length].rstrip() 1027 # right pad the sequence with spaces if necessary 1028 if len(seq) < self._seq_length: 1029 seq += ' ' * (self._seq_length - len(seq)) 1030 1031 # I need to make sure the sequence is aligned correctly with the query. 1032 # First, I will find the length of the query. Then, if necessary, 1033 # I will pad my current sequence with spaces so that they will line 1034 # up correctly. 1035 1036 # Two possible things can happen: 1037 # QUERY 1038 # 504 1039 # 1040 # QUERY 1041 # 403 1042 # 1043 # Sequence 504 will need padding at the end. Since I won't know 1044 # this until the end of the alignment, this will be handled in 1045 # end_alignment. 1046 # Sequence 403 will need padding before being added to the alignment. 1047 1048 align = self._multiple_alignment.alignment # for convenience 1049 align.append((name, start, seq, end))
1050 1051 # This is old code that tried to line up all the sequences 1052 # in a multiple alignment by using the sequence title's as 1053 # identifiers. The problem with this is that BLAST assigns 1054 # different HSP's from the same sequence the same id. Thus, 1055 # in one alignment block, there may be multiple sequences with 1056 # the same id. I'm not sure how to handle this, so I'm not 1057 # going to. 1058 1059 # # If the sequence is the query, then just add it. 1060 # if name == 'QUERY': 1061 # if len(align) == 0: 1062 # align.append((name, start, seq)) 1063 # else: 1064 # aname, astart, aseq = align[0] 1065 # if name != aname: 1066 # raise ValueError, "Query is not the first sequence" 1067 # aseq = aseq + seq 1068 # align[0] = aname, astart, aseq 1069 # else: 1070 # if len(align) == 0: 1071 # raise ValueError, "I could not find the query sequence" 1072 # qname, qstart, qseq = align[0] 1073 # 1074 # # Now find my sequence in the multiple alignment. 1075 # for i in range(1, len(align)): 1076 # aname, astart, aseq = align[i] 1077 # if name == aname: 1078 # index = i 1079 # break 1080 # else: 1081 # # If I couldn't find it, then add a new one. 1082 # align.append((None, None, None)) 1083 # index = len(align)-1 1084 # # Make sure to left-pad it. 1085 # aname, astart, aseq = name, start, ' '*(len(qseq)-len(seq)) 1086 # 1087 # if len(qseq) != len(aseq) + len(seq): 1088 # # If my sequences are shorter than the query sequence, 1089 # # then I will need to pad some spaces to make them line up. 1090 # # Since I've already right padded seq, that means aseq 1091 # # must be too short. 1092 # aseq = aseq + ' '*(len(qseq)-len(aseq)-len(seq)) 1093 # aseq = aseq + seq 1094 # if astart is None: 1095 # astart = start 1096 # align[index] = aname, astart, aseq 1097
1098 - def end_alignment(self):
1099 # Remove trailing newlines 1100 if self._alignment: 1101 self._alignment.title = self._alignment.title.rstrip() 1102 1103 # This code is also obsolete. See note above. 1104 # If there's a multiple alignment, I will need to make sure 1105 # all the sequences are aligned. That is, I may need to 1106 # right-pad the sequences. 1107 # if self._multiple_alignment is not None: 1108 # align = self._multiple_alignment.alignment 1109 # seqlen = None 1110 # for i in range(len(align)): 1111 # name, start, seq = align[i] 1112 # if seqlen is None: 1113 # seqlen = len(seq) 1114 # else: 1115 # if len(seq) < seqlen: 1116 # seq = seq + ' '*(seqlen - len(seq)) 1117 # align[i] = name, start, seq 1118 # elif len(seq) > seqlen: 1119 # raise ValueError, \ 1120 # "Sequence %s is longer than the query" % name 1121 1122 # Clean up some variables, if they exist. 1123 try: 1124 del self._seq_index 1125 del self._seq_length 1126 del self._start_index 1127 del self._start_length 1128 del self._name_length 1129 except AttributeError: 1130 pass
1131 1132
1133 -class _HSPConsumer(object):
1134 - def start_hsp(self):
1135 self._hsp = Record.HSP()
1136
1137 - def score(self, line):
1138 self._hsp.bits, self._hsp.score = _re_search( 1139 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line, 1140 "I could not find the score in line\n%s" % line) 1141 self._hsp.score = _safe_float(self._hsp.score) 1142 self._hsp.bits = _safe_float(self._hsp.bits) 1143 1144 x, y = _re_search( 1145 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line, 1146 "I could not find the expect in line\n%s" % line) 1147 if x: 1148 self._hsp.num_alignments = _safe_int(x) 1149 else: 1150 self._hsp.num_alignments = 1 1151 self._hsp.expect = _safe_float(y)
1152
1153 - def identities(self, line):
1154 x, y = _re_search( 1155 r"Identities = (\d+)\/(\d+)", line, 1156 "I could not find the identities in line\n%s" % line) 1157 self._hsp.identities = _safe_int(x), _safe_int(y) 1158 self._hsp.align_length = _safe_int(y) 1159 1160 if 'Positives' in line: 1161 x, y = _re_search( 1162 r"Positives = (\d+)\/(\d+)", line, 1163 "I could not find the positives in line\n%s" % line) 1164 self._hsp.positives = _safe_int(x), _safe_int(y) 1165 assert self._hsp.align_length == _safe_int(y) 1166 1167 if 'Gaps' in line: 1168 x, y = _re_search( 1169 r"Gaps = (\d+)\/(\d+)", line, 1170 "I could not find the gaps in line\n%s" % line) 1171 self._hsp.gaps = _safe_int(x), _safe_int(y) 1172 assert self._hsp.align_length == _safe_int(y)
1173
1174 - def strand(self, line):
1175 self._hsp.strand = _re_search( 1176 r"Strand\s?=\s?(\w+)\s?/\s?(\w+)", line, 1177 "I could not find the strand in line\n%s" % line)
1178
1179 - def frame(self, line):
1180 # Frame can be in formats: 1181 # Frame = +1 1182 # Frame = +2 / +2 1183 if '/' in line: 1184 self._hsp.frame = _re_search( 1185 r"Frame\s?=\s?([-+][123])\s?/\s?([-+][123])", line, 1186 "I could not find the frame in line\n%s" % line) 1187 else: 1188 self._hsp.frame = _re_search( 1189 r"Frame = ([-+][123])", line, 1190 "I could not find the frame in line\n%s" % line)
1191 1192 # Match a space, if one is available. Masahir Ishikawa found a 1193 # case where there's no space between the start and the sequence: 1194 # Query: 100tt 101 1195 # line below modified by Yair Benita, Sep 2004 1196 # Note that the colon is not always present. 2006 1197 _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)") 1198
1199 - def query(self, line):
1200 m = self._query_re.search(line) 1201 if m is None: 1202 if line.strip() == "Query ------------------------------------------------------------": 1203 # Special case - long gap relative to the subject, 1204 # note there is no start/end present, cannot update those 1205 self._hsp.query += "-" * 60 1206 self._query_len = 60 # number of dashes 1207 self._query_start_index = 13 # offset of first dash 1208 return 1209 raise ValueError("I could not find the query in line\n%s" % line) 1210 1211 # line below modified by Yair Benita, Sep 2004. 1212 # added the end attribute for the query 1213 colon, start, seq, end = m.groups() 1214 seq = seq.strip() 1215 self._hsp.query += seq 1216 if self._hsp.query_start is None: 1217 self._hsp.query_start = _safe_int(start) 1218 1219 # line below added by Yair Benita, Sep 2004. 1220 # added the end attribute for the query 1221 self._hsp.query_end = _safe_int(end) 1222 1223 # Get index for sequence start (regular expression element 3) 1224 self._query_start_index = m.start(3) 1225 self._query_len = len(seq)
1226
1227 - def align(self, line):
1228 seq = line[self._query_start_index:].rstrip() 1229 if len(seq) < self._query_len: 1230 # Make sure the alignment is the same length as the query 1231 seq += ' ' * (self._query_len - len(seq)) 1232 elif len(seq) < self._query_len: 1233 raise ValueError("Match is longer than the query in line\n%s" 1234 % line) 1235 self._hsp.match = self._hsp.match + seq
1236 1237 # To match how we do the query, cache the regular expression. 1238 # Note that the colon is not always present. 1239 _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)") 1240
1241 - def sbjct(self, line):
1242 m = self._sbjct_re.search(line) 1243 if m is None: 1244 raise ValueError("I could not find the sbjct in line\n%s" % line) 1245 colon, start, seq, end = m.groups() 1246 # mikep 26/9/00 1247 # On occasion, there is a blast hit with no subject match 1248 # so far, it only occurs with 1-line short "matches" 1249 # I have decided to let these pass as they appear 1250 if not seq.strip(): 1251 seq = ' ' * self._query_len 1252 else: 1253 seq = seq.strip() 1254 self._hsp.sbjct += seq 1255 if self._hsp.sbjct_start is None: 1256 self._hsp.sbjct_start = _safe_int(start) 1257 1258 self._hsp.sbjct_end = _safe_int(end) 1259 if len(seq) != self._query_len: 1260 raise ValueError( 1261 "QUERY and SBJCT sequence lengths don't match (%i %r vs %i) in line\n%s" 1262 % (self._query_len, self._hsp.query, len(seq), line)) 1263 1264 del self._query_start_index # clean up unused variables 1265 del self._query_len
1266
1267 - def end_hsp(self):
1268 pass
1269 1270
1271 -class _DatabaseReportConsumer(object):
1272
1273 - def start_database_report(self):
1274 self._dr = Record.DatabaseReport()
1275
1276 - def database(self, line):
1277 m = re.search(r"Database: (.+)$", line) 1278 if m: 1279 self._dr.database_name.append(m.group(1)) 1280 elif self._dr.database_name: 1281 # This must be a continuation of the previous name. 1282 self._dr.database_name[-1] = "%s%s" % (self._dr.database_name[-1], 1283 line.strip())
1284
1285 - def posted_date(self, line):
1286 self._dr.posted_date.append(_re_search( 1287 r"Posted date:\s*(.+)$", line, 1288 "I could not find the posted date in line\n%s" % line))
1289
1290 - def num_letters_in_database(self, line):
1291 letters, = _get_cols( 1292 line, (-1,), ncols=6, expected={2: "letters", 4: "database:"}) 1293 self._dr.num_letters_in_database.append(_safe_int(letters))
1294
1295 - def num_sequences_in_database(self, line):
1296 sequences, = _get_cols( 1297 line, (-1,), ncols=6, expected={2: "sequences", 4: "database:"}) 1298 self._dr.num_sequences_in_database.append(_safe_int(sequences))
1299
1300 - def ka_params(self, line):
1301 self._dr.ka_params = [_safe_float(x) for x in line.split()]
1302
1303 - def gapped(self, line):
1304 self._dr.gapped = 1
1305
1306 - def ka_params_gap(self, line):
1307 self._dr.ka_params_gap = [_safe_float(x) for x in line.split()]
1308
1309 - def end_database_report(self):
1310 pass
1311 1312
1313 -class _ParametersConsumer(object):
1314 - def start_parameters(self):
1315 self._params = Record.Parameters()
1316
1317 - def matrix(self, line):
1318 self._params.matrix = line[8:].rstrip()
1319
1320 - def gap_penalties(self, line):
1321 self._params.gap_penalties = [_safe_float(x) for x in _get_cols( 1322 line, (3, 5), ncols=6, expected={2: "Existence:", 4: "Extension:"})]
1323
1324 - def num_hits(self, line):
1325 if '1st pass' in line: 1326 x, = _get_cols(line, (-4,), ncols=11, expected={2: "Hits"}) 1327 self._params.num_hits = _safe_int(x) 1328 else: 1329 x, = _get_cols(line, (-1,), ncols=6, expected={2: "Hits"}) 1330 self._params.num_hits = _safe_int(x)
1331
1332 - def num_sequences(self, line):
1333 if '1st pass' in line: 1334 x, = _get_cols(line, (-4,), ncols=9, expected={2: "Sequences:"}) 1335 self._params.num_sequences = _safe_int(x) 1336 else: 1337 x, = _get_cols(line, (-1,), ncols=4, expected={2: "Sequences:"}) 1338 self._params.num_sequences = _safe_int(x)
1339
1340 - def num_extends(self, line):
1341 if '1st pass' in line: 1342 x, = _get_cols(line, (-4,), ncols=9, expected={2: "extensions:"}) 1343 self._params.num_extends = _safe_int(x) 1344 else: 1345 x, = _get_cols(line, (-1,), ncols=4, expected={2: "extensions:"}) 1346 self._params.num_extends = _safe_int(x)
1347
1348 - def num_good_extends(self, line):
1349 if '1st pass' in line: 1350 x, = _get_cols(line, (-4,), ncols=10, expected={3: "extensions:"}) 1351 self._params.num_good_extends = _safe_int(x) 1352 else: 1353 x, = _get_cols(line, (-1,), ncols=5, expected={3: "extensions:"}) 1354 self._params.num_good_extends = _safe_int(x)
1355
1356 - def num_seqs_better_e(self, line):
1357 self._params.num_seqs_better_e, = _get_cols( 1358 line, (-1,), ncols=7, expected={2: "sequences"}) 1359 self._params.num_seqs_better_e = _safe_int( 1360 self._params.num_seqs_better_e)
1361
1362 - def hsps_no_gap(self, line):
1363 self._params.hsps_no_gap, = _get_cols( 1364 line, (-1,), ncols=9, expected={3: "better", 7: "gapping:"}) 1365 self._params.hsps_no_gap = _safe_int(self._params.hsps_no_gap)
1366
1367 - def hsps_prelim_gapped(self, line):
1368 self._params.hsps_prelim_gapped, = _get_cols( 1369 line, (-1,), ncols=9, expected={4: "gapped", 6: "prelim"}) 1370 self._params.hsps_prelim_gapped = _safe_int( 1371 self._params.hsps_prelim_gapped)
1372
1373 - def hsps_prelim_gapped_attempted(self, line):
1374 self._params.hsps_prelim_gapped_attempted, = _get_cols( 1375 line, (-1,), ncols=10, expected={4: "attempted", 7: "prelim"}) 1376 self._params.hsps_prelim_gapped_attempted = _safe_int( 1377 self._params.hsps_prelim_gapped_attempted)
1378
1379 - def hsps_gapped(self, line):
1380 self._params.hsps_gapped, = _get_cols( 1381 line, (-1,), ncols=6, expected={3: "gapped"}) 1382 self._params.hsps_gapped = _safe_int(self._params.hsps_gapped)
1383
1384 - def query_length(self, line):
1385 self._params.query_length, = _get_cols( 1386 line.lower(), (-1,), ncols=4, expected={0: "length", 2: "query:"}) 1387 self._params.query_length = _safe_int(self._params.query_length)
1388
1389 - def database_length(self, line):
1390 self._params.database_length, = _get_cols( 1391 line.lower(), (-1,), ncols=4, expected={0: "length", 2: "database:"}) 1392 self._params.database_length = _safe_int(self._params.database_length)
1393
1394 - def effective_hsp_length(self, line):
1395 self._params.effective_hsp_length, = _get_cols( 1396 line, (-1,), ncols=4, expected={1: "HSP", 2: "length:"}) 1397 self._params.effective_hsp_length = _safe_int( 1398 self._params.effective_hsp_length)
1399
1400 - def effective_query_length(self, line):
1401 self._params.effective_query_length, = _get_cols( 1402 line, (-1,), ncols=5, expected={1: "length", 3: "query:"}) 1403 self._params.effective_query_length = _safe_int( 1404 self._params.effective_query_length)
1405
1406 - def effective_database_length(self, line):
1407 self._params.effective_database_length, = _get_cols( 1408 line.lower(), (-1,), ncols=5, expected={1: "length", 3: "database:"}) 1409 self._params.effective_database_length = _safe_int( 1410 self._params.effective_database_length)
1411
1412 - def effective_search_space(self, line):
1413 self._params.effective_search_space, = _get_cols( 1414 line, (-1,), ncols=4, expected={1: "search"}) 1415 self._params.effective_search_space = _safe_int( 1416 self._params.effective_search_space)
1417
1418 - def effective_search_space_used(self, line):
1419 self._params.effective_search_space_used, = _get_cols( 1420 line, (-1,), ncols=5, expected={1: "search", 3: "used:"}) 1421 self._params.effective_search_space_used = _safe_int( 1422 self._params.effective_search_space_used)
1423
1424 - def frameshift(self, line):
1425 self._params.frameshift = _get_cols( 1426 line, (4, 5), ncols=6, expected={0: "frameshift", 2: "decay"})
1427
1428 - def threshold(self, line):
1429 if line[:2] == "T:": 1430 # Assume its an old stlye line like "T: 123" 1431 self._params.threshold, = _get_cols( 1432 line, (1,), ncols=2, expected={0: "T:"}) 1433 elif line[:28] == "Neighboring words threshold:": 1434 self._params.threshold, = _get_cols( 1435 line, (3,), ncols=4, expected={0: "Neighboring", 1: "words", 2: "threshold:"}) 1436 else: 1437 raise ValueError("Unrecognised threshold line:\n%s" % line) 1438 self._params.threshold = _safe_int(self._params.threshold)
1439
1440 - def window_size(self, line):
1441 if line[:2] == "A:": 1442 self._params.window_size, = _get_cols( 1443 line, (1,), ncols=2, expected={0: "A:"}) 1444 elif line[:25] == "Window for multiple hits:": 1445 self._params.window_size, = _get_cols( 1446 line, (4,), ncols=5, expected={0: "Window", 2: "multiple", 3: "hits:"}) 1447 else: 1448 raise ValueError("Unrecognised window size line:\n%s" % line) 1449 self._params.window_size = _safe_int(self._params.window_size)
1450
1451 - def dropoff_1st_pass(self, line):
1452 score, bits = _re_search( 1453 r"X1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1454 "I could not find the dropoff in line\n%s" % line) 1455 self._params.dropoff_1st_pass = _safe_int(score), _safe_float(bits)
1456
1457 - def gap_x_dropoff(self, line):
1458 score, bits = _re_search( 1459 r"X2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1460 "I could not find the gap dropoff in line\n%s" % line) 1461 self._params.gap_x_dropoff = _safe_int(score), _safe_float(bits)
1462
1463 - def gap_x_dropoff_final(self, line):
1464 score, bits = _re_search( 1465 r"X3: (\d+) \(\s*([0-9,.]+) bits\)", line, 1466 "I could not find the gap dropoff final in line\n%s" % line) 1467 self._params.gap_x_dropoff_final = _safe_int(score), _safe_float(bits)
1468
1469 - def gap_trigger(self, line):
1470 score, bits = _re_search( 1471 r"S1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1472 "I could not find the gap trigger in line\n%s" % line) 1473 self._params.gap_trigger = _safe_int(score), _safe_float(bits)
1474
1475 - def blast_cutoff(self, line):
1476 score, bits = _re_search( 1477 r"S2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1478 "I could not find the blast cutoff in line\n%s" % line) 1479 self._params.blast_cutoff = _safe_int(score), _safe_float(bits)
1480
1481 - def end_parameters(self):
1482 pass
1483 1484
1485 -class _BlastConsumer(AbstractConsumer, 1486 _HeaderConsumer, 1487 _DescriptionConsumer, 1488 _AlignmentConsumer, 1489 _HSPConsumer, 1490 _DatabaseReportConsumer, 1491 _ParametersConsumer 1492 ):
1493 # This Consumer is inherits from many other consumer classes that handle 1494 # the actual dirty work. An alternate way to do it is to create objects 1495 # of those classes and then delegate the parsing tasks to them in a 1496 # decorator-type pattern. The disadvantage of that is that the method 1497 # names will need to be resolved in this classes. However, using 1498 # a decorator will retain more control in this class (which may or 1499 # may not be a bad thing). In addition, having each sub-consumer as 1500 # its own object prevents this object's dictionary from being cluttered 1501 # with members and reduces the chance of member collisions.
1502 - def __init__(self):
1503 self.data = None
1504
1505 - def round(self, line):
1506 # Make sure nobody's trying to pass me PSI-BLAST data! 1507 raise ValueError("This consumer doesn't handle PSI-BLAST data")
1508
1509 - def start_header(self):
1510 self.data = Record.Blast() 1511 _HeaderConsumer.start_header(self)
1512
1513 - def end_header(self):
1514 _HeaderConsumer.end_header(self) 1515 self.data.__dict__.update(self._header.__dict__)
1516
1517 - def end_descriptions(self):
1518 self.data.descriptions = self._descriptions
1519
1520 - def end_alignment(self):
1521 _AlignmentConsumer.end_alignment(self) 1522 if self._alignment.hsps: 1523 self.data.alignments.append(self._alignment) 1524 if self._multiple_alignment.alignment: 1525 self.data.multiple_alignment = self._multiple_alignment
1526
1527 - def end_hsp(self):
1528 _HSPConsumer.end_hsp(self) 1529 try: 1530 self._alignment.hsps.append(self._hsp) 1531 except AttributeError: 1532 raise ValueError("Found an HSP before an alignment")
1533
1534 - def end_database_report(self):
1535 _DatabaseReportConsumer.end_database_report(self) 1536 self.data.__dict__.update(self._dr.__dict__)
1537
1538 - def end_parameters(self):
1539 _ParametersConsumer.end_parameters(self) 1540 self.data.__dict__.update(self._params.__dict__)
1541 1542
1543 -class _PSIBlastConsumer(AbstractConsumer, 1544 _HeaderConsumer, 1545 _DescriptionConsumer, 1546 _AlignmentConsumer, 1547 _HSPConsumer, 1548 _DatabaseReportConsumer, 1549 _ParametersConsumer 1550 ):
1551 - def __init__(self):
1552 self.data = None
1553
1554 - def start_header(self):
1555 self.data = Record.PSIBlast() 1556 _HeaderConsumer.start_header(self)
1557
1558 - def end_header(self):
1559 _HeaderConsumer.end_header(self) 1560 self.data.__dict__.update(self._header.__dict__)
1561
1562 - def start_descriptions(self):
1563 self._round = Record.Round() 1564 self.data.rounds.append(self._round) 1565 _DescriptionConsumer.start_descriptions(self)
1566
1567 - def end_descriptions(self):
1568 _DescriptionConsumer.end_descriptions(self) 1569 self._round.number = self._roundnum 1570 if self._descriptions: 1571 self._round.new_seqs.extend(self._descriptions) 1572 self._round.reused_seqs.extend(self._model_sequences) 1573 self._round.new_seqs.extend(self._nonmodel_sequences) 1574 if self._converged: 1575 self.data.converged = 1
1576
1577 - def end_alignment(self):
1578 _AlignmentConsumer.end_alignment(self) 1579 if self._alignment.hsps: 1580 self._round.alignments.append(self._alignment) 1581 if self._multiple_alignment: 1582 self._round.multiple_alignment = self._multiple_alignment
1583
1584 - def end_hsp(self):
1585 _HSPConsumer.end_hsp(self) 1586 try: 1587 self._alignment.hsps.append(self._hsp) 1588 except AttributeError: 1589 raise ValueError("Found an HSP before an alignment")
1590
1591 - def end_database_report(self):
1592 _DatabaseReportConsumer.end_database_report(self) 1593 self.data.__dict__.update(self._dr.__dict__)
1594
1595 - def end_parameters(self):
1596 _ParametersConsumer.end_parameters(self) 1597 self.data.__dict__.update(self._params.__dict__)
1598 1599
1600 -class Iterator(object):
1601 """Iterates over a file of multiple BLAST results. 1602 1603 Methods: 1604 next Return the next record from the stream, or None. 1605 1606 """
1607 - def __init__(self, handle, parser=None):
1608 """__init__(self, handle, parser=None) 1609 1610 Create a new iterator. handle is a file-like object. parser 1611 is an optional Parser object to change the results into another form. 1612 If set to None, then the raw contents of the file will be returned. 1613 1614 """ 1615 try: 1616 handle.readline 1617 except AttributeError: 1618 raise ValueError( 1619 "I expected a file handle or file-like object, got %s" 1620 % type(handle)) 1621 self._uhandle = File.UndoHandle(handle) 1622 self._parser = parser 1623 self._header = []
1624
1625 - def __next__(self):
1626 """next(self) -> object 1627 1628 Return the next Blast record from the file. If no more records, 1629 return None. 1630 1631 """ 1632 lines = [] 1633 query = False 1634 while True: 1635 line = self._uhandle.readline() 1636 if not line: 1637 break 1638 # If I've reached the next one, then put the line back and stop. 1639 if lines and (line.startswith('BLAST') 1640 or line.startswith('BLAST', 1) 1641 or line.startswith('<?xml ')): 1642 self._uhandle.saveline(line) 1643 break 1644 # New style files omit the BLAST line to mark a new query: 1645 if line.startswith("Query="): 1646 if not query: 1647 if not self._header: 1648 self._header = lines[:] 1649 query = True 1650 else: 1651 # Start of another record 1652 self._uhandle.saveline(line) 1653 break 1654 lines.append(line) 1655 1656 if query and "BLAST" not in lines[0]: 1657 # Cheat and re-insert the header 1658 # print "-"*50 1659 # print "".join(self._header) 1660 # print "-"*50 1661 # print "".join(lines) 1662 # print "-"*50 1663 lines = self._header + lines 1664 1665 if not lines: 1666 return None 1667 1668 data = ''.join(lines) 1669 if self._parser is not None: 1670 return self._parser.parse(StringIO(data)) 1671 return data
1672 1673 if sys.version_info[0] < 3:
1674 - def next(self):
1675 """Python 2 style alias for Python 3 style __next__ method.""" 1676 return self.__next__()
1677
1678 - def __iter__(self):
1679 return iter(self.__next__, None)
1680 1681
1682 -def _re_search(regex, line, error_msg):
1683 m = re.search(regex, line) 1684 if not m: 1685 raise ValueError(error_msg) 1686 return m.groups()
1687 1688
1689 -def _get_cols(line, cols_to_get, ncols=None, expected=None):
1690 if expected is None: 1691 expected = {} 1692 cols = line.split() 1693 1694 # Check to make sure number of columns is correct 1695 if ncols is not None and len(cols) != ncols: 1696 raise ValueError("I expected %d columns (got %d) in line\n%s" 1697 % (ncols, len(cols), line)) 1698 1699 # Check to make sure columns contain the correct data 1700 for k in expected: 1701 if cols[k] != expected[k]: 1702 raise ValueError("I expected '%s' in column %d in line\n%s" 1703 % (expected[k], k, line)) 1704 1705 # Construct the answer tuple 1706 results = [] 1707 for c in cols_to_get: 1708 results.append(cols[c]) 1709 return tuple(results)
1710 1711
1712 -def _safe_int(str):
1713 try: 1714 return int(str) 1715 except ValueError: 1716 # Something went wrong. Try to clean up the string. 1717 # Remove all commas from the string 1718 str = str.replace(',', '') 1719 # try again after removing commas. 1720 # Note int() will return a long rather than overflow 1721 try: 1722 return int(str) 1723 except ValueError: 1724 pass 1725 # Call float to handle things like "54.3", note could lose precision, e.g. 1726 # >>> int("5399354557888517312") 1727 # 5399354557888517312 1728 # >>> int(float("5399354557888517312")) 1729 # 5399354557888517120 1730 return int(float(str))
1731 1732
1733 -def _safe_float(str):
1734 # Thomas Rosleff Soerensen (rosleff@mpiz-koeln.mpg.de) noted that 1735 # float('e-172') does not produce an error on his platform. Thus, 1736 # we need to check the string for this condition. 1737 1738 # Sometimes BLAST leaves of the '1' in front of an exponent. 1739 if str and str[0] in ['E', 'e']: 1740 str = '1' + str 1741 try: 1742 return float(str) 1743 except ValueError: 1744 # Remove all commas from the string 1745 str = str.replace(',', '') 1746 # try again. 1747 return float(str)
1748 1749
1750 -class _BlastErrorConsumer(_BlastConsumer):
1751 - def __init__(self):
1753
1754 - def noevent(self, line):
1755 if 'Query must be at least wordsize' in line: 1756 raise ShortQueryBlastError("Query must be at least wordsize") 1757 # Now pass the line back up to the superclass. 1758 method = getattr(_BlastConsumer, 'noevent', 1759 _BlastConsumer.__getattr__(self, 'noevent')) 1760 method(line)
1761 1762
1763 -class BlastErrorParser(AbstractParser):
1764 """Attempt to catch and diagnose BLAST errors while parsing. 1765 1766 This utilizes the BlastParser module but adds an additional layer 1767 of complexity on top of it by attempting to diagnose ValueErrors 1768 that may actually indicate problems during BLAST parsing. 1769 1770 Current BLAST problems this detects are: 1771 o LowQualityBlastError - When BLASTing really low quality sequences 1772 (ie. some GenBank entries which are just short streches of a single 1773 nucleotide), BLAST will report an error with the sequence and be 1774 unable to search with this. This will lead to a badly formatted 1775 BLAST report that the parsers choke on. The parser will convert the 1776 ValueError to a LowQualityBlastError and attempt to provide useful 1777 information. 1778 """
1779 - def __init__(self, bad_report_handle=None):
1780 """Initialize a parser that tries to catch BlastErrors. 1781 1782 Arguments: 1783 o bad_report_handle - An optional argument specifying a handle 1784 where bad reports should be sent. This would allow you to save 1785 all of the bad reports to a file, for instance. If no handle 1786 is specified, the bad reports will not be saved. 1787 """ 1788 self._bad_report_handle = bad_report_handle 1789 1790 # self._b_parser = BlastParser() 1791 self._scanner = _Scanner() 1792 self._consumer = _BlastErrorConsumer()
1793
1794 - def parse(self, handle):
1795 """Parse a handle, attempting to diagnose errors. 1796 """ 1797 results = handle.read() 1798 1799 try: 1800 self._scanner.feed(StringIO(