Package Bio :: Package Blast :: Module NCBIStandalone
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIStandalone

   1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # Patches by Mike Poidinger to support multiple databases. 
   6  # Updated by Peter Cock in 2007 to do a better job on BLAST 2.2.15 
   7   
   8  """Code for calling standalone BLAST and parsing plain text output (DEPRECATED). 
   9   
  10  Rather than parsing the human readable plain text BLAST output (which seems to 
  11  change with every update to BLAST), we and the NBCI recommend you parse the 
  12  XML output instead. The plain text parser in this module still works at the 
  13  time of writing, but is considered obsolete and updating it to cope with the 
  14  latest versions of BLAST is not a priority for us. 
  15   
  16  This module also provides code to work with the "legacy" standalone version of 
  17  NCBI BLAST, tools blastall, rpsblast and blastpgp via three helper functions of 
  18  the same name. These functions are very limited for dealing with the output as 
  19  files rather than handles, for which the wrappers in Bio.Blast.Applications are 
  20  preferred. Furthermore, the NCBI themselves regard these command line tools as 
  21  "legacy", and encourage using the new BLAST+ tools instead. Biopython has 
  22  wrappers for these under Bio.Blast.Applications (see the tutorial). 
  23  """ 
  24   
  25  from __future__ import print_function 
  26   
  27  from Bio import BiopythonDeprecationWarning 
  28  import warnings 
  29  warnings.warn("This module has been deprecated. Consider Bio.SearchIO for " 
  30                "parsing BLAST output instead.", BiopythonDeprecationWarning) 
  31   
  32  import os 
  33  import re 
  34  from Bio._py3k import StringIO 
  35   
  36  from Bio import File 
  37  from Bio.ParserSupport import * 
  38  from Bio.Blast import Record 
  39  from Bio.Application import _escape_filename 
  40   
  41  __docformat__ = "restructuredtext en" 
  42   
  43   
44 -class LowQualityBlastError(Exception):
45 """Error caused by running a low quality sequence through BLAST. 46 47 When low quality sequences (like GenBank entries containing only 48 stretches of a single nucleotide) are BLASTed, they will result in 49 BLAST generating an error and not being able to perform the BLAST. 50 search. This error should be raised for the BLAST reports produced 51 in this case. 52 """ 53 pass
54 55
56 -class ShortQueryBlastError(Exception):
57 """Error caused by running a short query sequence through BLAST. 58 59 If the query sequence is too short, BLAST outputs warnings and errors:: 60 61 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed. 62 [blastall] ERROR: [000.000] AT1G08320: Blast: 63 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize 64 done 65 66 This exception is raised when that condition is detected. 67 """ 68 pass
69 70
71 -class _Scanner(object):
72 """Scan BLAST output from blastall or blastpgp. 73 74 Tested with blastall and blastpgp v2.0.10, v2.0.11 75 76 Methods: 77 - feed Feed data into the scanner. 78 """
79 - def feed(self, handle, consumer):
80 """S.feed(handle, consumer) 81 82 Feed in a BLAST report for scanning. handle is a file-like 83 object that contains the BLAST report. consumer is a Consumer 84 object that will receive events as the report is scanned. 85 """ 86 if isinstance(handle, File.UndoHandle): 87 uhandle = handle 88 else: 89 uhandle = File.UndoHandle(handle) 90 91 # Try to fast-forward to the beginning of the blast report. 92 read_and_call_until(uhandle, consumer.noevent, contains='BLAST') 93 # Now scan the BLAST report. 94 self._scan_header(uhandle, consumer) 95 self._scan_rounds(uhandle, consumer) 96 self._scan_database_report(uhandle, consumer) 97 self._scan_parameters(uhandle, consumer)
98
99 - def _scan_header(self, uhandle, consumer):
100 # BLASTP 2.0.10 [Aug-26-1999] 101 # 102 # 103 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaf 104 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), 105 # "Gapped BLAST and PSI-BLAST: a new generation of protein database sea 106 # programs", Nucleic Acids Res. 25:3389-3402. 107 # 108 # Query= test 109 # (140 letters) 110 # 111 # Database: sdqib40-1.35.seg.fa 112 # 1323 sequences; 223,339 total letters 113 # 114 # ======================================================== 115 # This next example is from the online version of Blast, 116 # note there are TWO references, an RID line, and also 117 # the database is BEFORE the query line. 118 # Note there possibleuse of non-ASCII in the author names. 119 # ======================================================== 120 # 121 # BLASTP 2.2.15 [Oct-15-2006] 122 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Sch??ffer, 123 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman 124 # (1997), "Gapped BLAST and PSI-BLAST: a new generation of 125 # protein database search programs", Nucleic Acids Res. 25:3389-3402. 126 # 127 # Reference: Sch??ffer, Alejandro A., L. Aravind, Thomas L. Madden, Sergei 128 # Shavirin, John L. Spouge, Yuri I. Wolf, Eugene V. Koonin, and 129 # Stephen F. Altschul (2001), "Improving the accuracy of PSI-BLAST 130 # protein database searches with composition-based statistics 131 # and other refinements", Nucleic Acids Res. 29:2994-3005. 132 # 133 # RID: 1166022616-19998-65316425856.BLASTQ1 134 # 135 # 136 # Database: All non-redundant GenBank CDS 137 # translations+PDB+SwissProt+PIR+PRF excluding environmental samples 138 # 4,254,166 sequences; 1,462,033,012 total letters 139 # Query= gi:16127998 140 # Length=428 141 # 142 143 consumer.start_header() 144 145 read_and_call(uhandle, consumer.version, contains='BLAST') 146 read_and_call_while(uhandle, consumer.noevent, blank=1) 147 148 # There might be a <pre> line, for qblast output. 149 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>") 150 151 # Read the reference(s) 152 while attempt_read_and_call(uhandle, 153 consumer.reference, start='Reference'): 154 # References are normally multiline terminated by a blank line 155 # (or, based on the old code, the RID line) 156 while True: 157 line = uhandle.readline() 158 if is_blank_line(line): 159 consumer.noevent(line) 160 break 161 elif line.startswith("RID"): 162 break 163 else: 164 # More of the reference 165 consumer.reference(line) 166 167 # Deal with the optional RID: ... 168 read_and_call_while(uhandle, consumer.noevent, blank=1) 169 attempt_read_and_call(uhandle, consumer.reference, start="RID:") 170 read_and_call_while(uhandle, consumer.noevent, blank=1) 171 172 # blastpgp may have a reference for compositional score matrix 173 # adjustment (see Bug 2502): 174 if attempt_read_and_call( 175 uhandle, consumer.reference, start="Reference"): 176 read_and_call_until(uhandle, consumer.reference, blank=1) 177 read_and_call_while(uhandle, consumer.noevent, blank=1) 178 179 # blastpgp has a Reference for composition-based statistics. 180 if attempt_read_and_call( 181 uhandle, consumer.reference, start="Reference"): 182 read_and_call_until(uhandle, consumer.reference, blank=1) 183 read_and_call_while(uhandle, consumer.noevent, blank=1) 184 185 line = uhandle.peekline() 186 assert line.strip() != "" 187 assert not line.startswith("RID:") 188 if line.startswith("Query="): 189 # This is an old style query then database... 190 191 # Read the Query lines and the following blank line. 192 read_and_call(uhandle, consumer.query_info, start='Query=') 193 read_and_call_until(uhandle, consumer.query_info, blank=1) 194 read_and_call_while(uhandle, consumer.noevent, blank=1) 195 196 # Read the database lines and the following blank line. 197 read_and_call_until(uhandle, consumer.database_info, end='total letters') 198 read_and_call(uhandle, consumer.database_info, contains='sequences') 199 read_and_call_while(uhandle, consumer.noevent, blank=1) 200 elif line.startswith("Database:"): 201 # This is a new style database then query... 202 read_and_call_until(uhandle, consumer.database_info, end='total letters') 203 read_and_call(uhandle, consumer.database_info, contains='sequences') 204 read_and_call_while(uhandle, consumer.noevent, blank=1) 205 206 # Read the Query lines and the following blank line. 207 # Or, on BLAST 2.2.22+ there is no blank link - need to spot 208 # the "... Score E" line instead. 209 read_and_call(uhandle, consumer.query_info, start='Query=') 210 # BLAST 2.2.25+ has a blank line before Length= 211 read_and_call_until(uhandle, consumer.query_info, start='Length=') 212 while True: 213 line = uhandle.peekline() 214 if not line.strip() or "Score E" in line: 215 break 216 # It is more of the query (and its length) 217 read_and_call(uhandle, consumer.query_info) 218 read_and_call_while(uhandle, consumer.noevent, blank=1) 219 else: 220 raise ValueError("Invalid header?") 221 222 consumer.end_header()
223
224 - def _scan_rounds(self, uhandle, consumer):
225 # Scan a bunch of rounds. 226 # Each round begins with either a "Searching......" line 227 # or a 'Score E' line followed by descriptions and alignments. 228 # The email server doesn't give the "Searching....." line. 229 # If there is no 'Searching.....' line then you'll first see a 230 # 'Results from round' line 231 232 while not self._eof(uhandle): 233 line = safe_peekline(uhandle) 234 if not line.startswith('Searching') and \ 235 not line.startswith('Results from round') and \ 236 re.search(r"Score +E", line) is None and \ 237 'No hits found' not in line: 238 break 239 self._scan_descriptions(uhandle, consumer) 240 self._scan_alignments(uhandle, consumer)
241
242 - def _scan_descriptions(self, uhandle, consumer):
243 # Searching..................................................done 244 # Results from round 2 245 # 246 # 247 # Sc 248 # Sequences producing significant alignments: (b 249 # Sequences used in model and found again: 250 # 251 # d1tde_2 3.4.1.4.4 (119-244) Thioredoxin reductase [Escherichia ... 252 # d1tcob_ 1.31.1.5.16 Calcineurin regulatory subunit (B-chain) [B... 253 # d1symb_ 1.31.1.2.2 Calcyclin (S100) [RAT (RATTUS NORVEGICUS)] 254 # 255 # Sequences not found previously or not previously below threshold: 256 # 257 # d1osa__ 1.31.1.5.11 Calmodulin [Paramecium tetraurelia] 258 # d1aoza3 2.5.1.3.3 (339-552) Ascorbate oxidase [zucchini (Cucurb... 259 # 260 261 # If PSI-BLAST, may also have: 262 # 263 # CONVERGED! 264 265 consumer.start_descriptions() 266 267 # Read 'Searching' 268 # This line seems to be missing in BLASTN 2.1.2 (others?) 269 attempt_read_and_call(uhandle, consumer.noevent, start='Searching') 270 271 # blastpgp 2.0.10 from NCBI 9/19/99 for Solaris sometimes crashes here. 272 # If this happens, the handle will yield no more information. 273 if not uhandle.peekline(): 274 raise ValueError("Unexpected end of blast report. " + 275 "Looks suspiciously like a PSI-BLAST crash.") 276 277 # BLASTN 2.2.3 sometimes spews a bunch of warnings and errors here: 278 # Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch 279 # [blastall] ERROR: [000.000] AT1G08320: Blast: 280 # [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at leas 281 # done 282 # Reported by David Weisman. 283 # Check for these error lines and ignore them for now. Let 284 # the BlastErrorParser deal with them. 285 line = uhandle.peekline() 286 if "ERROR:" in line or line.startswith("done"): 287 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:") 288 read_and_call(uhandle, consumer.noevent, start="done") 289 290 # Check to see if this is PSI-BLAST. 291 # If it is, the 'Searching' line will be followed by: 292 # (version 2.0.10) 293 # Searching............................. 294 # Results from round 2 295 # or (version 2.0.11) 296 # Searching............................. 297 # 298 # 299 # Results from round 2 300 301 # Skip a bunch of blank lines. 302 read_and_call_while(uhandle, consumer.noevent, blank=1) 303 # Check for the results line if it's there. 304 if attempt_read_and_call(uhandle, consumer.round, start='Results'): 305 read_and_call_while(uhandle, consumer.noevent, blank=1) 306 307 # Three things can happen here: 308 # 1. line contains 'Score E' 309 # 2. line contains "No hits found" 310 # 3. no descriptions 311 # The first one begins a bunch of descriptions. The last two 312 # indicates that no descriptions follow, and we should go straight 313 # to the alignments. 314 if not attempt_read_and_call( 315 uhandle, consumer.description_header, 316 has_re=re.compile(r'Score +E')): 317 # Either case 2 or 3. Look for "No hits found". 318 attempt_read_and_call(uhandle, consumer.no_hits, 319 contains='No hits found') 320 try: 321 read_and_call_while(uhandle, consumer.noevent, blank=1) 322 except ValueError as err: 323 if str(err) != "Unexpected end of stream.": 324 raise err 325 326 consumer.end_descriptions() 327 # Stop processing. 328 return 329 330 # Read the score header lines 331 read_and_call(uhandle, consumer.description_header, 332 start='Sequences producing') 333 334 # If PSI-BLAST, read the 'Sequences used in model' line. 335 attempt_read_and_call(uhandle, consumer.model_sequences, 336 start='Sequences used in model') 337 read_and_call_while(uhandle, consumer.noevent, blank=1) 338 339 # In BLAT, rather than a "No hits found" line, we just 340 # get no descriptions (and no alignments). This can be 341 # spotted because the next line is the database block: 342 if safe_peekline(uhandle).startswith(" Database:"): 343 consumer.end_descriptions() 344 # Stop processing. 345 return 346 347 # Read the descriptions and the following blank lines, making 348 # sure that there are descriptions. 349 if not uhandle.peekline().startswith('Sequences not found'): 350 read_and_call_until(uhandle, consumer.description, blank=1) 351 read_and_call_while(uhandle, consumer.noevent, blank=1) 352 353 # If PSI-BLAST, read the 'Sequences not found' line followed 354 # by more descriptions. However, I need to watch out for the 355 # case where there were no sequences not found previously, in 356 # which case there will be no more descriptions. 357 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences, 358 start='Sequences not found'): 359 # Read the descriptions and the following blank lines. 360 read_and_call_while(uhandle, consumer.noevent, blank=1) 361 l = safe_peekline(uhandle) 362 # Brad -- added check for QUERY. On some PSI-BLAST outputs 363 # there will be a 'Sequences not found' line followed by no 364 # descriptions. Check for this case since the first thing you'll 365 # get is a blank line and then 'QUERY' 366 if not l.startswith('CONVERGED') and l[0] != '>' \ 367 and not l.startswith('QUERY'): 368 read_and_call_until(uhandle, consumer.description, blank=1) 369 read_and_call_while(uhandle, consumer.noevent, blank=1) 370 371 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED') 372 read_and_call_while(uhandle, consumer.noevent, blank=1) 373 374 consumer.end_descriptions()
375
376 - def _scan_alignments(self, uhandle, consumer):
377 if self._eof(uhandle): 378 return 379 380 # qblast inserts a helpful line here. 381 attempt_read_and_call(uhandle, consumer.noevent, start="ALIGNMENTS") 382 383 # First, check to see if I'm at the database report. 384 line = safe_peekline(uhandle) 385 if not line: 386 # EOF 387 return 388 elif line.startswith(' Database') or line.startswith("Lambda"): 389 return 390 elif line[0] == '>': 391 # XXX make a better check here between pairwise and masterslave 392 self._scan_pairwise_alignments(uhandle, consumer) 393 elif line.startswith('Effective'): 394 return 395 else: 396 # XXX put in a check to make sure I'm in a masterslave alignment 397 self._scan_masterslave_alignment(uhandle, consumer)
398
399 - def _scan_pairwise_alignments(self, uhandle, consumer):
400 while not self._eof(uhandle): 401 line = safe_peekline(uhandle) 402 if line[0] != '>': 403 break 404 self._scan_one_pairwise_alignment(uhandle, consumer)
405
406 - def _scan_one_pairwise_alignment(self, uhandle, consumer):
407 if self._eof(uhandle): 408 return 409 consumer.start_alignment() 410 411 self._scan_alignment_header(uhandle, consumer) 412 413 # Scan a bunch of score/alignment pairs. 414 while True: 415 if self._eof(uhandle): 416 # Shouldn't have issued that _scan_alignment_header event... 417 break 418 line = safe_peekline(uhandle) 419 if not line.startswith(' Score'): 420 break 421 self._scan_hsp(uhandle, consumer) 422 consumer.end_alignment()
423
424 - def _scan_alignment_header(self, uhandle, consumer):
425 # >d1rip__ 2.24.7.1.1 Ribosomal S17 protein [Bacillus 426 # stearothermophilus] 427 # Length = 81 428 # 429 # Or, more recently with different white space: 430 # 431 # >gi|15799684|ref|NP_285696.1| threonine synthase ... 432 # gi|15829258|ref|NP_308031.1| threonine synthase 433 # ... 434 # Length=428 435 read_and_call(uhandle, consumer.title, start='>') 436 while True: 437 line = safe_readline(uhandle) 438 if line.lstrip().startswith('Length =') \ 439 or line.lstrip().startswith('Length='): 440 consumer.length(line) 441 break 442 elif is_blank_line(line): 443 # Check to make sure I haven't missed the Length line 444 raise ValueError("I missed the Length in an alignment header") 445 consumer.title(line) 446 447 # Older versions of BLAST will have a line with some spaces. 448 # Version 2.0.14 (maybe 2.0.13?) and above print a true blank line. 449 if not attempt_read_and_call(uhandle, consumer.noevent, 450 start=' '): 451 read_and_call(uhandle, consumer.noevent, blank=1)
452
453 - def _scan_hsp(self, uhandle, consumer):
454 consumer.start_hsp() 455 self._scan_hsp_header(uhandle, consumer) 456 self._scan_hsp_alignment(uhandle, consumer) 457 consumer.end_hsp()
458
459 - def _scan_hsp_header(self, uhandle, consumer):
460 # Score = 22.7 bits (47), Expect = 2.5 461 # Identities = 10/36 (27%), Positives = 18/36 (49%) 462 # Strand = Plus / Plus 463 # Frame = +3 464 # 465 466 read_and_call(uhandle, consumer.score, start=' Score') 467 read_and_call(uhandle, consumer.identities, start=' Identities') 468 # BLASTN 469 attempt_read_and_call(uhandle, consumer.strand, start=' Strand') 470 # BLASTX, TBLASTN, TBLASTX 471 attempt_read_and_call(uhandle, consumer.frame, start=' Frame') 472 read_and_call(uhandle, consumer.noevent, blank=1)
473
474 - def _scan_hsp_alignment(self, uhandle, consumer):
475 # Query: 11 GRGVSACA-------TCDGFFYRNQKVAVIGGGNTAVEEALYLSNIASEVHLIHRRDGF 476 # GRGVS+ TC Y + + V GGG+ + EE L + I R+ 477 # Sbjct: 12 GRGVSSVVRRCIHKPTCKE--YAVKIIDVTGGGSFSAEEVQELREATLKEVDILRKVSG 478 # 479 # Query: 64 AEKILIKR 71 480 # I +K 481 # Sbjct: 70 PNIIQLKD 77 482 # 483 484 while True: 485 # Blastn adds an extra line filled with spaces before Query 486 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 487 read_and_call(uhandle, consumer.query, start='Query') 488 read_and_call(uhandle, consumer.align, start=' ') 489 read_and_call(uhandle, consumer.sbjct, start='Sbjct') 490 try: 491 read_and_call_while(uhandle, consumer.noevent, blank=1) 492 except ValueError as err: 493 if str(err) != "Unexpected end of stream.": 494 raise err 495 # End of File (well, it looks like it with recent versions 496 # of BLAST for multiple queries after the Iterator class 497 # has broken up the whole file into chunks). 498 break 499 line = safe_peekline(uhandle) 500 # Alignment continues if I see a 'Query' or the spaces for Blastn. 501 if not (line.startswith('Query') or line.startswith(' ')): 502 break
503
504 - def _scan_masterslave_alignment(self, uhandle, consumer):
505 consumer.start_alignment() 506 while True: 507 line = safe_readline(uhandle) 508 # Check to see whether I'm finished reading the alignment. 509 # This is indicated by 1) database section, 2) next psi-blast 510 # round, which can also be a 'Results from round' if no 511 # searching line is present 512 # patch by chapmanb 513 if line.startswith('Searching') or \ 514 line.startswith('Results from round'): 515 uhandle.saveline(line) 516 break 517 elif line.startswith(' Database'): 518 uhandle.saveline(line) 519 break 520 elif is_blank_line(line): 521 consumer.noevent(line) 522 else: 523 consumer.multalign(line) 524 read_and_call_while(uhandle, consumer.noevent, blank=1) 525 consumer.end_alignment()
526
527 - def _eof(self, uhandle):
528 try: 529 line = safe_peekline(uhandle) 530 except ValueError as err: 531 if str(err) != "Unexpected end of stream.": 532 raise err 533 line = "" 534 return not line
535
536 - def _scan_database_report(self, uhandle, consumer):
537 # Database: sdqib40-1.35.seg.fa 538 # Posted date: Nov 1, 1999 4:25 PM 539 # Number of letters in database: 223,339 540 # Number of sequences in database: 1323 541 # 542 # Lambda K H 543 # 0.322 0.133 0.369 544 # 545 # Gapped 546 # Lambda K H 547 # 0.270 0.0470 0.230 548 # 549 ########################################## 550 # Or, more recently Blast 2.2.15 gives less blank lines 551 ########################################## 552 # Database: All non-redundant GenBank CDS translations+PDB+SwissProt+PIR+PRF excluding 553 # environmental samples 554 # Posted date: Dec 12, 2006 5:51 PM 555 # Number of letters in database: 667,088,753 556 # Number of sequences in database: 2,094,974 557 # Lambda K H 558 # 0.319 0.136 0.395 559 # Gapped 560 # Lambda K H 561 # 0.267 0.0410 0.140 562 563 if self._eof(uhandle): 564 return 565 566 consumer.start_database_report() 567 568 # Subset of the database(s) listed below 569 # Number of letters searched: 562,618,960 570 # Number of sequences searched: 228,924 571 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"): 572 read_and_call(uhandle, consumer.noevent, contains="letters") 573 read_and_call(uhandle, consumer.noevent, contains="sequences") 574 read_and_call(uhandle, consumer.noevent, start=" ") 575 576 # Sameet Mehta reported seeing output from BLASTN 2.2.9 that 577 # was missing the "Database" stanza completely. 578 while attempt_read_and_call(uhandle, consumer.database, 579 start=' Database'): 580 # BLAT output ends abruptly here, without any of the other 581 # information. Check to see if this is the case. If so, 582 # then end the database report here gracefully. 583 if not uhandle.peekline().strip() \ 584 or uhandle.peekline().startswith("BLAST"): 585 consumer.end_database_report() 586 return 587 588 # Database can span multiple lines. 589 read_and_call_until(uhandle, consumer.database, start=' Posted') 590 read_and_call(uhandle, consumer.posted_date, start=' Posted') 591 read_and_call(uhandle, consumer.num_letters_in_database, 592 start=' Number of letters') 593 read_and_call(uhandle, consumer.num_sequences_in_database, 594 start=' Number of sequences') 595 # There may not be a line starting with spaces... 596 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 597 598 line = safe_readline(uhandle) 599 uhandle.saveline(line) 600 if 'Lambda' in line: 601 break 602 603 try: 604 read_and_call(uhandle, consumer.noevent, start='Lambda') 605 read_and_call(uhandle, consumer.ka_params) 606 except: 607 pass 608 609 # This blank line is optional: 610 attempt_read_and_call(uhandle, consumer.noevent, blank=1) 611 612 # not BLASTP 613 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped') 614 # not TBLASTX 615 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'): 616 read_and_call(uhandle, consumer.ka_params_gap) 617 618 # Blast 2.2.4 can sometimes skip the whole parameter section. 619 # Thus, I need to be careful not to read past the end of the 620 # file. 621 try: 622 read_and_call_while(uhandle, consumer.noevent, blank=1) 623 except ValueError as x: 624 if str(x) != "Unexpected end of stream.": 625 raise 626 consumer.end_database_report()
627
628 - def _scan_parameters(self, uhandle, consumer):
629 # Matrix: BLOSUM62 630 # Gap Penalties: Existence: 11, Extension: 1 631 # Number of Hits to DB: 50604 632 # Number of Sequences: 1323 633 # Number of extensions: 1526 634 # Number of successful extensions: 6 635 # Number of sequences better than 10.0: 5 636 # Number of HSP's better than 10.0 without gapping: 5 637 # Number of HSP's successfully gapped in prelim test: 0 638 # Number of HSP's that attempted gapping in prelim test: 1 639 # Number of HSP's gapped (non-prelim): 5 640 # length of query: 140 641 # length of database: 223,339 642 # effective HSP length: 39 643 # effective length of query: 101 644 # effective length of database: 171,742 645 # effective search space: 17345942 646 # effective search space used: 17345942 647 # T: 11 648 # A: 40 649 # X1: 16 ( 7.4 bits) 650 # X2: 38 (14.8 bits) 651 # X3: 64 (24.9 bits) 652 # S1: 41 (21.9 bits) 653 # S2: 42 (20.8 bits) 654 ########################################## 655 # Or, more recently Blast(x) 2.2.15 gives 656 ########################################## 657 # Matrix: BLOSUM62 658 # Gap Penalties: Existence: 11, Extension: 1 659 # Number of Sequences: 4535438 660 # Number of Hits to DB: 2,588,844,100 661 # Number of extensions: 60427286 662 # Number of successful extensions: 126433 663 # Number of sequences better than 2.0: 30 664 # Number of HSP's gapped: 126387 665 # Number of HSP's successfully gapped: 35 666 # Length of query: 291 667 # Length of database: 1,573,298,872 668 # Length adjustment: 130 669 # Effective length of query: 161 670 # Effective length of database: 983,691,932 671 # Effective search space: 158374401052 672 # Effective search space used: 158374401052 673 # Neighboring words threshold: 12 674 # Window for multiple hits: 40 675 # X1: 16 ( 7.3 bits) 676 # X2: 38 (14.6 bits) 677 # X3: 64 (24.7 bits) 678 # S1: 41 (21.7 bits) 679 # S2: 32 (16.9 bits) 680 681 # Blast 2.2.4 can sometimes skip the whole parameter section. 682 # BLAT also skips the whole parameter section. 683 # Thus, check to make sure that the parameter section really 684 # exists. 685 if not uhandle.peekline().strip(): 686 return 687 688 # BLASTN 2.2.9 looks like it reverses the "Number of Hits" and 689 # "Number of Sequences" lines. 690 consumer.start_parameters() 691 692 # Matrix line may be missing in BLASTN 2.2.9 693 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix') 694 # not TBLASTX 695 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap') 696 697 attempt_read_and_call(uhandle, consumer.num_sequences, 698 start='Number of Sequences') 699 attempt_read_and_call(uhandle, consumer.num_hits, 700 start='Number of Hits') 701 attempt_read_and_call(uhandle, consumer.num_sequences, 702 start='Number of Sequences') 703 attempt_read_and_call(uhandle, consumer.num_extends, 704 start='Number of extensions') 705 attempt_read_and_call(uhandle, consumer.num_good_extends, 706 start='Number of successful') 707 708 attempt_read_and_call(uhandle, consumer.num_seqs_better_e, 709 start='Number of sequences') 710 711 # not BLASTN, TBLASTX 712 if attempt_read_and_call(uhandle, consumer.hsps_no_gap, 713 start="Number of HSP's better"): 714 # BLASTN 2.2.9 715 if attempt_read_and_call(uhandle, consumer.noevent, 716 start="Number of HSP's gapped:"): 717 read_and_call(uhandle, consumer.noevent, 718 start="Number of HSP's successfully") 719 # This is omitted in 2.2.15 720 attempt_read_and_call(uhandle, consumer.noevent, 721 start="Number of extra gapped extensions") 722 else: 723 read_and_call(uhandle, consumer.hsps_prelim_gapped, 724 start="Number of HSP's successfully") 725 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted, 726 start="Number of HSP's that") 727 read_and_call(uhandle, consumer.hsps_gapped, 728 start="Number of HSP's gapped") 729 # e.g. BLASTX 2.2.15 where the "better" line is missing 730 elif attempt_read_and_call(uhandle, consumer.noevent, 731 start="Number of HSP's gapped"): 732 read_and_call(uhandle, consumer.noevent, 733 start="Number of HSP's successfully") 734 735 # not in blastx 2.2.1 736 attempt_read_and_call(uhandle, consumer.query_length, 737 has_re=re.compile(r"[Ll]ength of query")) 738 # Not in BLASTX 2.2.22+ 739 attempt_read_and_call(uhandle, consumer.database_length, 740 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase")) 741 742 # BLASTN 2.2.9 743 attempt_read_and_call(uhandle, consumer.noevent, 744 start="Length adjustment") 745 attempt_read_and_call(uhandle, consumer.effective_hsp_length, 746 start='effective HSP') 747 # Not in blastx 2.2.1 748 attempt_read_and_call( 749 uhandle, consumer.effective_query_length, 750 has_re=re.compile(r'[Ee]ffective length of query')) 751 752 # This is not in BLASTP 2.2.15 753 attempt_read_and_call( 754 uhandle, consumer.effective_database_length, 755 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase')) 756 # Not in blastx 2.2.1, added a ':' to distinguish between 757 # this and the 'effective search space used' line 758 attempt_read_and_call( 759 uhandle, consumer.effective_search_space, 760 has_re=re.compile(r'[Ee]ffective search space:')) 761 # Does not appear in BLASTP 2.0.5 762 attempt_read_and_call( 763 uhandle, consumer.effective_search_space_used, 764 has_re=re.compile(r'[Ee]ffective search space used')) 765 766 # BLASTX, TBLASTN, TBLASTX 767 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift') 768 769 # not in BLASTN 2.2.9 770 attempt_read_and_call(uhandle, consumer.threshold, start='T') 771 # In BLASTX 2.2.15 replaced by: "Neighboring words threshold: 12" 772 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring words threshold') 773 774 # not in BLASTX 2.2.15 775 attempt_read_and_call(uhandle, consumer.window_size, start='A') 776 # get this instead: "Window for multiple hits: 40" 777 attempt_read_and_call(uhandle, consumer.window_size, start='Window for multiple hits') 778 779 # not in BLASTX 2.2.22+ 780 attempt_read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1') 781 # not TBLASTN 782 attempt_read_and_call(uhandle, consumer.gap_x_dropoff, start='X2') 783 784 # not BLASTN, TBLASTX 785 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final, 786 start='X3') 787 788 # not TBLASTN 789 attempt_read_and_call(uhandle, consumer.gap_trigger, start='S1') 790 # not in blastx 2.2.1 791 # first we make sure we have additional lines to work with, if 792 # not then the file is done and we don't have a final S2 793 if not is_blank_line(uhandle.peekline(), allow_spaces=1): 794 read_and_call(uhandle, consumer.blast_cutoff, start='S2') 795 796 consumer.end_parameters()
797 798
799 -class BlastParser(AbstractParser):
800 """Parses BLAST data into a Record.Blast object. 801 802 """
803 - def __init__(self):
804 """__init__(self)""" 805 self._scanner = _Scanner() 806 self._consumer = _BlastConsumer()
807
808 - def parse(self, handle):
809 """parse(self, handle)""" 810 self._scanner.feed(handle, self._consumer) 811 return self._consumer.data
812 813
814 -class PSIBlastParser(AbstractParser):
815 """Parses BLAST data into a Record.PSIBlast object. 816 817 """
818 - def __init__(self):
819 """__init__(self)""" 820 self._scanner = _Scanner() 821 self._consumer = _PSIBlastConsumer()
822
823 - def parse(self, handle):
824 """parse(self, handle)""" 825 self._scanner.feed(handle, self._consumer) 826 return self._consumer.data
827 828
829 -class _HeaderConsumer(object):
830 - def start_header(self):
831 self._header = Record.Header()
832
833 - def version(self, line):
834 c = line.split() 835 self._header.application = c[0] 836 self._header.version = c[1] 837 if len(c) > 2: 838 # The date is missing in the new C++ output from blastx 2.2.22+ 839 # Just get "BLASTX 2.2.22+\n" and that's all. 840 self._header.date = c[2][1:-1]
841
842 - def reference(self, line):
843 if line.startswith('Reference: '): 844 self._header.reference = line[11:] 845 else: 846 self._header.reference = self._header.reference + line
847
848 - def query_info(self, line):
849 if line.startswith('Query= '): 850 self._header.query = line[7:].lstrip() 851 elif line.startswith('Length='): 852 # New style way to give the query length in BLAST 2.2.22+ (the C++ code) 853 self._header.query_letters = _safe_int(line[7:].strip()) 854 elif not line.startswith(' '): # continuation of query_info 855 self._header.query = "%s%s" % (self._header.query, line) 856 else: 857 # Hope it is the old style way to give the query length: 858 letters, = _re_search( 859 r"([0-9,]+) letters", line, 860 "I could not find the number of letters in line\n%s" % line) 861 self._header.query_letters = _safe_int(letters)
862
863 - def database_info(self, line):
864 line = line.rstrip() 865 if line.startswith('Database: '): 866 self._header.database = line[10:] 867 elif not line.endswith('total letters'): 868 if self._header.database: 869 # Need to include a space when merging multi line datase descr 870 self._header.database = self._header.database + " " + line.strip() 871 else: 872 self._header.database = line.strip() 873 else: 874 sequences, letters = _re_search( 875 r"([0-9,]+) sequences; ([0-9,-]+) total letters", line, 876 "I could not find the sequences and letters in line\n%s" % line) 877 self._header.database_sequences = _safe_int(sequences) 878 self._header.database_letters = _safe_int(letters)
879
880 - def end_header(self):
881 # Get rid of the trailing newlines 882 self._header.reference = self._header.reference.rstrip() 883 self._header.query = self._header.query.rstrip()
884 885
886 -class _DescriptionConsumer(object):
887 - def start_descriptions(self):
888 self._descriptions = [] 889 self._model_sequences = [] 890 self._nonmodel_sequences = [] 891 self._converged = 0 892 self._type = None 893 self._roundnum = None 894 895 self.__has_n = 0 # Does the description line contain an N value?
896
897 - def description_header(self, line):
898 if line.startswith('Sequences producing'): 899 cols = line.split() 900 if cols[-1] == 'N': 901 self.__has_n = 1
902
903 - def description(self, line):
904 dh = self._parse(line) 905 if self._type == 'model': 906 self._model_sequences.append(dh) 907 elif self._type == 'nonmodel': 908 self._nonmodel_sequences.append(dh) 909 else: 910 self._descriptions.append(dh)
911
912 - def model_sequences(self, line):
913 self._type = 'model'
914
915 - def nonmodel_sequences(self, line):
916 self._type = 'nonmodel'
917
918 - def converged(self, line):
919 self._converged = 1
920
921 - def no_hits(self, line):
922 pass
923
924 - def round(self, line):
925 if not line.startswith('Results from round'): 926 raise ValueError("I didn't understand the round line\n%s" % line) 927 self._roundnum = _safe_int(line[18:].strip())
928
929 - def end_descriptions(self):
930 pass
931
932 - def _parse(self, description_line):
933 line = description_line # for convenience 934 dh = Record.Description() 935 936 # I need to separate the score and p-value from the title. 937 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 938 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 1 939 # special cases to handle: 940 # - title must be preserved exactly (including whitespaces) 941 # - score could be equal to e-value (not likely, but what if??) 942 # - sometimes there's an "N" score of '1'. 943 cols = line.split() 944 if len(cols) < 3: 945 raise ValueError( 946 "Line does not appear to contain description:\n%s" % line) 947 if self.__has_n: 948 i = line.rfind(cols[-1]) # find start of N 949 i = line.rfind(cols[-2], 0, i) # find start of p-value 950 i = line.rfind(cols[-3], 0, i) # find start of score 951 else: 952 i = line.rfind(cols[-1]) # find start of p-value 953 i = line.rfind(cols[-2], 0, i) # find start of score 954 if self.__has_n: 955 dh.title, dh.score, dh.e, dh.num_alignments = \ 956 line[:i].rstrip(), cols[-3], cols[-2], cols[-1] 957 else: 958 dh.title, dh.score, dh.e, dh.num_alignments = \ 959 line[:i].rstrip(), cols[-2], cols[-1], 1 960 dh.num_alignments = _safe_int(dh.num_alignments) 961 dh.score = _safe_int(dh.score) 962 dh.e = _safe_float(dh.e) 963 return dh
964 965
966 -class _AlignmentConsumer(object):
967 # This is a little bit tricky. An alignment can either be a 968 # pairwise alignment or a multiple alignment. Since it's difficult 969 # to know a-priori which one the blast record will contain, I'm going 970 # to make one class that can parse both of them.
971 - def start_alignment(self):
972 self._alignment = Record.Alignment() 973 self._multiple_alignment = Record.MultipleAlignment()
974
975 - def title(self, line):
976 if self._alignment.title: 977 self._alignment.title += " " 978 self._alignment.title += line.strip()
979
980 - def length(self, line):
981 # e.g. "Length = 81" or more recently, "Length=428" 982 parts = line.replace(" ", "").split("=") 983 assert len(parts) == 2, "Unrecognised format length line" 984 self._alignment.length = parts[1] 985 self._alignment.length = _safe_int(self._alignment.length)
986
987 - def multalign(self, line):
988 # Standalone version uses 'QUERY', while WWW version uses blast_tmp. 989 if line.startswith('QUERY') or line.startswith('blast_tmp'): 990 # If this is the first line of the multiple alignment, 991 # then I need to figure out how the line is formatted. 992 993 # Format of line is: 994 # QUERY 1 acttg...gccagaggtggtttattcagtctccataagagaggggacaaacg 60 995 try: 996 name, start, seq, end = line.split() 997 except ValueError: 998 raise ValueError("I do not understand the line\n%s" % line) 999 self._start_index = line.index(start, len(name)) 1000 self._seq_index = line.index(seq, 1001 self._start_index + len(start)) 1002 # subtract 1 for the space 1003 self._name_length = self._start_index - 1 1004 self._start_length = self._seq_index - self._start_index - 1 1005 self._seq_length = line.rfind(end) - self._seq_index - 1 1006 1007 # self._seq_index = line.index(seq) 1008 # # subtract 1 for the space 1009 # self._seq_length = line.rfind(end) - self._seq_index - 1 1010 # self._start_index = line.index(start) 1011 # self._start_length = self._seq_index - self._start_index - 1 1012 # self._name_length = self._start_index 1013 1014 # Extract the information from the line 1015 name = line[:self._name_length] 1016 name = name.rstrip() 1017 start = line[self._start_index:self._start_index + self._start_length] 1018 start = start.rstrip() 1019 if start: 1020 start = _safe_int(start) 1021 end = line[self._seq_index + self._seq_length:].rstrip() 1022 if end: 1023 end = _safe_int(end) 1024 seq = line[self._seq_index:self._seq_index + self._seq_length].rstrip() 1025 # right pad the sequence with spaces if necessary 1026 if len(seq) < self._seq_length: 1027 seq += ' ' * (self._seq_length - len(seq)) 1028 1029 # I need to make sure the sequence is aligned correctly with the query. 1030 # First, I will find the length of the query. Then, if necessary, 1031 # I will pad my current sequence with spaces so that they will line 1032 # up correctly. 1033 1034 # Two possible things can happen: 1035 # QUERY 1036 # 504 1037 # 1038 # QUERY 1039 # 403 1040 # 1041 # Sequence 504 will need padding at the end. Since I won't know 1042 # this until the end of the alignment, this will be handled in 1043 # end_alignment. 1044 # Sequence 403 will need padding before being added to the alignment. 1045 1046 align = self._multiple_alignment.alignment # for convenience 1047 align.append((name, start, seq, end))
1048 1049 # This is old code that tried to line up all the sequences 1050 # in a multiple alignment by using the sequence title's as 1051 # identifiers. The problem with this is that BLAST assigns 1052 # different HSP's from the same sequence the same id. Thus, 1053 # in one alignment block, there may be multiple sequences with 1054 # the same id. I'm not sure how to handle this, so I'm not 1055 # going to. 1056 1057 # # If the sequence is the query, then just add it. 1058 # if name == 'QUERY': 1059 # if len(align) == 0: 1060 # align.append((name, start, seq)) 1061 # else: 1062 # aname, astart, aseq = align[0] 1063 # if name != aname: 1064 # raise ValueError, "Query is not the first sequence" 1065 # aseq = aseq + seq 1066 # align[0] = aname, astart, aseq 1067 # else: 1068 # if len(align) == 0: 1069 # raise ValueError, "I could not find the query sequence" 1070 # qname, qstart, qseq = align[0] 1071 # 1072 # # Now find my sequence in the multiple alignment. 1073 # for i in range(1, len(align)): 1074 # aname, astart, aseq = align[i] 1075 # if name == aname: 1076 # index = i 1077 # break 1078 # else: 1079 # # If I couldn't find it, then add a new one. 1080 # align.append((None, None, None)) 1081 # index = len(align)-1 1082 # # Make sure to left-pad it. 1083 # aname, astart, aseq = name, start, ' '*(len(qseq)-len(seq)) 1084 # 1085 # if len(qseq) != len(aseq) + len(seq): 1086 # # If my sequences are shorter than the query sequence, 1087 # # then I will need to pad some spaces to make them line up. 1088 # # Since I've already right padded seq, that means aseq 1089 # # must be too short. 1090 # aseq = aseq + ' '*(len(qseq)-len(aseq)-len(seq)) 1091 # aseq = aseq + seq 1092 # if astart is None: 1093 # astart = start 1094 # align[index] = aname, astart, aseq 1095
1096 - def end_alignment(self):
1097 # Remove trailing newlines 1098 if self._alignment: 1099 self._alignment.title = self._alignment.title.rstrip() 1100 1101 # This code is also obsolete. See note above. 1102 # If there's a multiple alignment, I will need to make sure 1103 # all the sequences are aligned. That is, I may need to 1104 # right-pad the sequences. 1105 # if self._multiple_alignment is not None: 1106 # align = self._multiple_alignment.alignment 1107 # seqlen = None 1108 # for i in range(len(align)): 1109 # name, start, seq = align[i] 1110 # if seqlen is None: 1111 # seqlen = len(seq) 1112 # else: 1113 # if len(seq) < seqlen: 1114 # seq = seq + ' '*(seqlen - len(seq)) 1115 # align[i] = name, start, seq 1116 # elif len(seq) > seqlen: 1117 # raise ValueError, \ 1118 # "Sequence %s is longer than the query" % name 1119 1120 # Clean up some variables, if they exist. 1121 try: 1122 del self._seq_index 1123 del self._seq_length 1124 del self._start_index 1125 del self._start_length 1126 del self._name_length 1127 except AttributeError: 1128 pass
1129 1130
1131 -class _HSPConsumer(object):
1132 - def start_hsp(self):
1133 self._hsp = Record.HSP()
1134
1135 - def score(self, line):
1136 self._hsp.bits, self._hsp.score = _re_search( 1137 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line, 1138 "I could not find the score in line\n%s" % line) 1139 self._hsp.score = _safe_float(self._hsp.score) 1140 self._hsp.bits = _safe_float(self._hsp.bits) 1141 1142 x, y = _re_search( 1143 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line, 1144 "I could not find the expect in line\n%s" % line) 1145 if x: 1146 self._hsp.num_alignments = _safe_int(x) 1147 else: 1148 self._hsp.num_alignments = 1 1149 self._hsp.expect = _safe_float(y)
1150
1151 - def identities(self, line):
1152 x, y = _re_search( 1153 r"Identities = (\d+)\/(\d+)", line, 1154 "I could not find the identities in line\n%s" % line) 1155 self._hsp.identities = _safe_int(x), _safe_int(y) 1156 self._hsp.align_length = _safe_int(y) 1157 1158 if 'Positives' in line: 1159 x, y = _re_search( 1160 r"Positives = (\d+)\/(\d+)", line, 1161 "I could not find the positives in line\n%s" % line) 1162 self._hsp.positives = _safe_int(x), _safe_int(y) 1163 assert self._hsp.align_length == _safe_int(y) 1164 1165 if 'Gaps' in line: 1166 x, y = _re_search( 1167 r"Gaps = (\d+)\/(\d+)", line, 1168 "I could not find the gaps in line\n%s" % line) 1169 self._hsp.gaps = _safe_int(x), _safe_int(y) 1170 assert self._hsp.align_length == _safe_int(y)
1171
1172 - def strand(self, line):
1173 self._hsp.strand = _re_search( 1174 r"Strand\s?=\s?(\w+)\s?/\s?(\w+)", line, 1175 "I could not find the strand in line\n%s" % line)
1176
1177 - def frame(self, line):
1178 # Frame can be in formats: 1179 # Frame = +1 1180 # Frame = +2 / +2 1181 if '/' in line: 1182 self._hsp.frame = _re_search( 1183 r"Frame\s?=\s?([-+][123])\s?/\s?([-+][123])", line, 1184 "I could not find the frame in line\n%s" % line) 1185 else: 1186 self._hsp.frame = _re_search( 1187 r"Frame = ([-+][123])", line, 1188 "I could not find the frame in line\n%s" % line)
1189 1190 # Match a space, if one is available. Masahir Ishikawa found a 1191 # case where there's no space between the start and the sequence: 1192 # Query: 100tt 101 1193 # line below modified by Yair Benita, Sep 2004 1194 # Note that the colon is not always present. 2006 1195 _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)") 1196
1197 - def query(self, line):
1198 m = self._query_re.search(line) 1199 if m is None: 1200 raise ValueError("I could not find the query in line\n%s" % line) 1201 1202 # line below modified by Yair Benita, Sep 2004. 1203 # added the end attribute for the query 1204 colon, start, seq, end = m.groups() 1205 self._hsp.query = self._hsp.query + seq 1206 if self._hsp.query_start is None: 1207 self._hsp.query_start = _safe_int(start) 1208 1209 # line below added by Yair Benita, Sep 2004. 1210 # added the end attribute for the query 1211 self._hsp.query_end = _safe_int(end) 1212 1213 # Get index for sequence start (regular expression element 3) 1214 self._query_start_index = m.start(3) 1215 self._query_len = len(seq)
1216
1217 - def align(self, line):
1218 seq = line[self._query_start_index:].rstrip() 1219 if len(seq) < self._query_len: 1220 # Make sure the alignment is the same length as the query 1221 seq += ' ' * (self._query_len - len(seq)) 1222 elif len(seq) < self._query_len: 1223 raise ValueError("Match is longer than the query in line\n%s" 1224 % line) 1225 self._hsp.match = self._hsp.match + seq
1226 1227 # To match how we do the query, cache the regular expression. 1228 # Note that the colon is not always present. 1229 _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)") 1230
1231 - def sbjct(self, line):
1232 m = self._sbjct_re.search(line) 1233 if m is None: 1234 raise ValueError("I could not find the sbjct in line\n%s" % line) 1235 colon, start, seq, end = m.groups() 1236 # mikep 26/9/00 1237 # On occasion, there is a blast hit with no subject match 1238 # so far, it only occurs with 1-line short "matches" 1239 # I have decided to let these pass as they appear 1240 if not seq.strip(): 1241 seq = ' ' * self._query_len 1242 self._hsp.sbjct = self._hsp.sbjct + seq 1243 if self._hsp.sbjct_start is None: 1244 self._hsp.sbjct_start = _safe_int(start) 1245 1246 self._hsp.sbjct_end = _safe_int(end) 1247 if len(seq) != self._query_len: 1248 raise ValueError( 1249 "QUERY and SBJCT sequence lengths don't match in line\n%s" 1250 % line) 1251 1252 del self._query_start_index # clean up unused variables 1253 del self._query_len
1254
1255 - def end_hsp(self):
1256 pass
1257 1258
1259 -class _DatabaseReportConsumer(object):
1260
1261 - def start_database_report(self):
1262 self._dr = Record.DatabaseReport()
1263
1264 - def database(self, line):
1265 m = re.search(r"Database: (.+)$", line) 1266 if m: 1267 self._dr.database_name.append(m.group(1)) 1268 elif self._dr.database_name: 1269 # This must be a continuation of the previous name. 1270 self._dr.database_name[-1] = "%s%s" % (self._dr.database_name[-1], 1271 line.strip())
1272
1273 - def posted_date(self, line):
1274 self._dr.posted_date.append(_re_search( 1275 r"Posted date:\s*(.+)$", line, 1276 "I could not find the posted date in line\n%s" % line))
1277
1278 - def num_letters_in_database(self, line):
1279 letters, = _get_cols( 1280 line, (-1,), ncols=6, expected={2: "letters", 4: "database:"}) 1281 self._dr.num_letters_in_database.append(_safe_int(letters))
1282
1283 - def num_sequences_in_database(self, line):
1284 sequences, = _get_cols( 1285 line, (-1,), ncols=6, expected={2: "sequences", 4: "database:"}) 1286 self._dr.num_sequences_in_database.append(_safe_int(sequences))
1287
1288 - def ka_params(self, line):
1289 self._dr.ka_params = [_safe_float(x) for x in line.split()]
1290
1291 - def gapped(self, line):
1292 self._dr.gapped = 1
1293
1294 - def ka_params_gap(self, line):
1295 self._dr.ka_params_gap = [_safe_float(x) for x in line.split()]
1296
1297 - def end_database_report(self):
1298 pass
1299 1300
1301 -class _ParametersConsumer(object):
1302 - def start_parameters(self):
1303 self._params = Record.Parameters()
1304
1305 - def matrix(self, line):
1306 self._params.matrix = line[8:].rstrip()
1307
1308 - def gap_penalties(self, line):
1309 self._params.gap_penalties = [_safe_float(x) for x in _get_cols( 1310 line, (3, 5), ncols=6, expected={2: "Existence:", 4: "Extension:"})]
1311
1312 - def num_hits(self, line):
1313 if '1st pass' in line: 1314 x, = _get_cols(line, (-4,), ncols=11, expected={2: "Hits"}) 1315 self._params.num_hits = _safe_int(x) 1316 else: 1317 x, = _get_cols(line, (-1,), ncols=6, expected={2: "Hits"}) 1318 self._params.num_hits = _safe_int(x)
1319
1320 - def num_sequences(self, line):
1321 if '1st pass' in line: 1322 x, = _get_cols(line, (-4,), ncols=9, expected={2: "Sequences:"}) 1323 self._params.num_sequences = _safe_int(x) 1324 else: 1325 x, = _get_cols(line, (-1,), ncols=4, expected={2: "Sequences:"}) 1326 self._params.num_sequences = _safe_int(x)
1327
1328 - def num_extends(self, line):
1329 if '1st pass' in line: 1330 x, = _get_cols(line, (-4,), ncols=9, expected={2: "extensions:"}) 1331 self._params.num_extends = _safe_int(x) 1332 else: 1333 x, = _get_cols(line, (-1,), ncols=4, expected={2: "extensions:"}) 1334 self._params.num_extends = _safe_int(x)
1335
1336 - def num_good_extends(self, line):
1337 if '1st pass' in line: 1338 x, = _get_cols(line, (-4,), ncols=10, expected={3: "extensions:"}) 1339 self._params.num_good_extends = _safe_int(x) 1340 else: 1341 x, = _get_cols(line, (-1,), ncols=5, expected={3: "extensions:"}) 1342 self._params.num_good_extends = _safe_int(x)
1343
1344 - def num_seqs_better_e(self, line):
1345 self._params.num_seqs_better_e, = _get_cols( 1346 line, (-1,), ncols=7, expected={2: "sequences"}) 1347 self._params.num_seqs_better_e = _safe_int( 1348 self._params.num_seqs_better_e)
1349
1350 - def hsps_no_gap(self, line):
1351 self._params.hsps_no_gap, = _get_cols( 1352 line, (-1,), ncols=9, expected={3: "better", 7: "gapping:"}) 1353 self._params.hsps_no_gap = _safe_int(self._params.hsps_no_gap)
1354
1355 - def hsps_prelim_gapped(self, line):
1356 self._params.hsps_prelim_gapped, = _get_cols( 1357 line, (-1,), ncols=9, expected={4: "gapped", 6: "prelim"}) 1358 self._params.hsps_prelim_gapped = _safe_int( 1359 self._params.hsps_prelim_gapped)
1360
1361 - def hsps_prelim_gapped_attempted(self, line):
1362 self._params.hsps_prelim_gapped_attempted, = _get_cols( 1363 line, (-1,), ncols=10, expected={4: "attempted", 7: "prelim"}) 1364 self._params.hsps_prelim_gapped_attempted = _safe_int( 1365 self._params.hsps_prelim_gapped_attempted)
1366
1367 - def hsps_gapped(self, line):
1368 self._params.hsps_gapped, = _get_cols( 1369 line, (-1,), ncols=6, expected={3: "gapped"}) 1370 self._params.hsps_gapped = _safe_int(self._params.hsps_gapped)
1371
1372 - def query_length(self, line):
1373 self._params.query_length, = _get_cols( 1374 line.lower(), (-1,), ncols=4, expected={0: "length", 2: "query:"}) 1375 self._params.query_length = _safe_int(self._params.query_length)
1376
1377 - def database_length(self, line):
1378 self._params.database_length, = _get_cols( 1379 line.lower(), (-1,), ncols=4, expected={0: "length", 2: "database:"}) 1380 self._params.database_length = _safe_int(self._params.database_length)
1381
1382 - def effective_hsp_length(self, line):
1383 self._params.effective_hsp_length, = _get_cols( 1384 line, (-1,), ncols=4, expected={1: "HSP", 2: "length:"}) 1385 self._params.effective_hsp_length = _safe_int( 1386 self._params.effective_hsp_length)
1387
1388 - def effective_query_length(self, line):
1389 self._params.effective_query_length, = _get_cols( 1390 line, (-1,), ncols=5, expected={1: "length", 3: "query:"}) 1391 self._params.effective_query_length = _safe_int( 1392 self._params.effective_query_length)
1393
1394 - def effective_database_length(self, line):
1395 self._params.effective_database_length, = _get_cols( 1396 line.lower(), (-1,), ncols=5, expected={1: "length", 3: "database:"}) 1397 self._params.effective_database_length = _safe_int( 1398 self._params.effective_database_length)
1399
1400 - def effective_search_space(self, line):
1401 self._params.effective_search_space, = _get_cols( 1402 line, (-1,), ncols=4, expected={1: "search"}) 1403 self._params.effective_search_space = _safe_int( 1404 self._params.effective_search_space)
1405
1406 - def effective_search_space_used(self, line):
1407 self._params.effective_search_space_used, = _get_cols( 1408 line, (-1,), ncols=5, expected={1: "search", 3: "used:"}) 1409 self._params.effective_search_space_used = _safe_int( 1410 self._params.effective_search_space_used)
1411
1412 - def frameshift(self, line):
1413 self._params.frameshift = _get_cols( 1414 line, (4, 5), ncols=6, expected={0: "frameshift", 2: "decay"})
1415
1416 - def threshold(self, line):
1417 if line[:2] == "T:": 1418 # Assume its an old stlye line like "T: 123" 1419 self._params.threshold, = _get_cols( 1420 line, (1,), ncols=2, expected={0: "T:"}) 1421 elif line[:28] == "Neighboring words threshold:": 1422 self._params.threshold, = _get_cols( 1423 line, (3,), ncols=4, expected={0: "Neighboring", 1: "words", 2: "threshold:"}) 1424 else: 1425 raise ValueError("Unrecognised threshold line:\n%s" % line) 1426 self._params.threshold = _safe_int(self._params.threshold)
1427
1428 - def window_size(self, line):
1429 if line[:2] == "A:": 1430 self._params.window_size, = _get_cols( 1431 line, (1,), ncols=2, expected={0: "A:"}) 1432 elif line[:25] == "Window for multiple hits:": 1433 self._params.window_size, = _get_cols( 1434 line, (4,), ncols=5, expected={0: "Window", 2: "multiple", 3: "hits:"}) 1435 else: 1436 raise ValueError("Unrecognised window size line:\n%s" % line) 1437 self._params.window_size = _safe_int(self._params.window_size)
1438
1439 - def dropoff_1st_pass(self, line):
1440 score, bits = _re_search( 1441 r"X1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1442 "I could not find the dropoff in line\n%s" % line) 1443 self._params.dropoff_1st_pass = _safe_int(score), _safe_float(bits)
1444
1445 - def gap_x_dropoff(self, line):
1446 score, bits = _re_search( 1447 r"X2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1448 "I could not find the gap dropoff in line\n%s" % line) 1449 self._params.gap_x_dropoff = _safe_int(score), _safe_float(bits)
1450
1451 - def gap_x_dropoff_final(self, line):
1452 score, bits = _re_search( 1453 r"X3: (\d+) \(\s*([0-9,.]+) bits\)", line, 1454 "I could not find the gap dropoff final in line\n%s" % line) 1455 self._params.gap_x_dropoff_final = _safe_int(score), _safe_float(bits)
1456
1457 - def gap_trigger(self, line):
1458 score, bits = _re_search( 1459 r"S1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1460 "I could not find the gap trigger in line\n%s" % line) 1461 self._params.gap_trigger = _safe_int(score), _safe_float(bits)
1462
1463 - def blast_cutoff(self, line):
1464 score, bits = _re_search( 1465 r"S2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1466 "I could not find the blast cutoff in line\n%s" % line) 1467 self._params.blast_cutoff = _safe_int(score), _safe_float(bits)
1468
1469 - def end_parameters(self):
1470 pass
1471 1472
1473 -class _BlastConsumer(AbstractConsumer, 1474 _HeaderConsumer, 1475 _DescriptionConsumer, 1476 _AlignmentConsumer, 1477 _HSPConsumer, 1478 _DatabaseReportConsumer, 1479 _ParametersConsumer 1480 ):
1481 # This Consumer is inherits from many other consumer classes that handle 1482 # the actual dirty work. An alternate way to do it is to create objects 1483 # of those classes and then delegate the parsing tasks to them in a 1484 # decorator-type pattern. The disadvantage of that is that the method 1485 # names will need to be resolved in this classes. However, using 1486 # a decorator will retain more control in this class (which may or 1487 # may not be a bad thing). In addition, having each sub-consumer as 1488 # its own object prevents this object's dictionary from being cluttered 1489 # with members and reduces the chance of member collisions.
1490 - def __init__(self):
1491 self.data = None
1492
1493 - def round(self, line):
1494 # Make sure nobody's trying to pass me PSI-BLAST data! 1495 raise ValueError("This consumer doesn't handle PSI-BLAST data")
1496
1497 - def start_header(self):
1498 self.data = Record.Blast() 1499 _HeaderConsumer.start_header(self)
1500
1501 - def end_header(self):
1502 _HeaderConsumer.end_header(self) 1503 self.data.__dict__.update(self._header.__dict__)
1504
1505 - def end_descriptions(self):
1506 self.data.descriptions = self._descriptions
1507
1508 - def end_alignment(self):
1509 _AlignmentConsumer.end_alignment(self) 1510 if self._alignment.hsps: 1511 self.data.alignments.append(self._alignment) 1512 if self._multiple_alignment.alignment: 1513 self.data.multiple_alignment = self._multiple_alignment
1514
1515 - def end_hsp(self):
1516 _HSPConsumer.end_hsp(self) 1517 try: 1518 self._alignment.hsps.append(self._hsp) 1519 except AttributeError: 1520 raise ValueError("Found an HSP before an alignment")
1521
1522 - def end_database_report(self):
1523 _DatabaseReportConsumer.end_database_report(self) 1524 self.data.__dict__.update(self._dr.__dict__)
1525
1526 - def end_parameters(self):
1527 _ParametersConsumer.end_parameters(self) 1528 self.data.__dict__.update(self._params.__dict__)
1529 1530
1531 -class _PSIBlastConsumer(AbstractConsumer, 1532 _HeaderConsumer, 1533 _DescriptionConsumer, 1534 _AlignmentConsumer, 1535 _HSPConsumer, 1536 _DatabaseReportConsumer, 1537 _ParametersConsumer 1538 ):
1539 - def __init__(self):
1540 self.data = None
1541
1542 - def start_header(self):
1543 self.data = Record.PSIBlast() 1544 _HeaderConsumer.start_header(self)
1545
1546 - def end_header(self):
1547 _HeaderConsumer.end_header(self) 1548 self.data.__dict__.update(self._header.__dict__)
1549
1550 - def start_descriptions(self):
1551 self._round = Record.Round() 1552 self.data.rounds.append(self._round) 1553 _DescriptionConsumer.start_descriptions(self)
1554
1555 - def end_descriptions(self):
1556 _DescriptionConsumer.end_descriptions(self) 1557 self._round.number = self._roundnum 1558 if self._descriptions: 1559 self._round.new_seqs.extend(self._descriptions) 1560 self._round.reused_seqs.extend(self._model_sequences) 1561 self._round.new_seqs.extend(self._nonmodel_sequences) 1562 if self._converged: 1563 self.data.converged = 1
1564
1565 - def end_alignment(self):
1566 _AlignmentConsumer.end_alignment(self) 1567 if self._alignment.hsps: 1568 self._round.alignments.append(self._alignment) 1569 if self._multiple_alignment: 1570 self._round.multiple_alignment = self._multiple_alignment
1571
1572 - def end_hsp(self):
1573 _HSPConsumer.end_hsp(self) 1574 try: 1575 self._alignment.hsps.append(self._hsp) 1576 except AttributeError: 1577 raise ValueError("Found an HSP before an alignment")
1578
1579 - def end_database_report(self):
1580 _DatabaseReportConsumer.end_database_report(self) 1581 self.data.__dict__.update(self._dr.__dict__)
1582
1583 - def end_parameters(self):
1584 _ParametersConsumer.end_parameters(self) 1585 self.data.__dict__.update(self._params.__dict__)
1586 1587
1588 -class Iterator(object):
1589 """Iterates over a file of multiple BLAST results. 1590 1591 Methods: 1592 next Return the next record from the stream, or None. 1593 1594 """
1595 - def __init__(self, handle, parser=None):
1596 """__init__(self, handle, parser=None) 1597 1598 Create a new iterator. handle is a file-like object. parser 1599 is an optional Parser object to change the results into another form. 1600 If set to None, then the raw contents of the file will be returned. 1601 1602 """ 1603 try: 1604 handle.readline 1605 except AttributeError: 1606 raise ValueError( 1607 "I expected a file handle or file-like object, got %s" 1608 % type(handle)) 1609 self._uhandle = File.UndoHandle(handle) 1610 self._parser = parser 1611 self._header = []
1612
1613 - def __next__(self):
1614 """next(self) -> object 1615 1616 Return the next Blast record from the file. If no more records, 1617 return None. 1618 1619 """ 1620 lines = [] 1621 query = False 1622 while True: 1623 line = self._uhandle.readline() 1624 if not line: 1625 break 1626 # If I've reached the next one, then put the line back and stop. 1627 if lines and (line.startswith('BLAST') 1628 or line.startswith('BLAST', 1) 1629 or line.startswith('<?xml ')): 1630 self._uhandle.saveline(line) 1631 break 1632 # New style files omit the BLAST line to mark a new query: 1633 if line.startswith("Query="): 1634 if not query: 1635 if not self._header: 1636 self._header = lines[:] 1637 query = True 1638 else: 1639 # Start of another record 1640 self._uhandle.saveline(line) 1641 break 1642 lines.append(line) 1643 1644 if query and "BLAST" not in lines[0]: 1645 # Cheat and re-insert the header 1646 # print "-"*50 1647 # print "".join(self._header) 1648 # print "-"*50 1649 # print "".join(lines) 1650 # print "-"*50 1651 lines = self._header + lines 1652 1653 if not lines: 1654 return None 1655 1656 data = ''.join(lines) 1657 if self._parser is not None: 1658 return self._parser.parse(StringIO(data)) 1659 return data
1660 1661 if sys.version_info[0] < 3:
1662 - def next(self):
1663 """Python 2 style alias for Python 3 style __next__ method.""" 1664 return self.__next__()
1665
1666 - def __iter__(self):
1667 return iter(self.__next__, None)
1668 1669
1670 -def _re_search(regex, line, error_msg):
1671 m = re.search(regex, line) 1672 if not m: 1673 raise ValueError(error_msg) 1674 return m.groups()
1675 1676
1677 -def _get_cols(line, cols_to_get, ncols=None, expected={}):
1678 cols = line.split() 1679 1680 # Check to make sure number of columns is correct 1681 if ncols is not None and len(cols) != ncols: 1682 raise ValueError("I expected %d columns (got %d) in line\n%s" 1683 % (ncols, len(cols), line)) 1684 1685 # Check to make sure columns contain the correct data 1686 for k in expected: 1687 if cols[k] != expected[k]: 1688 raise ValueError("I expected '%s' in column %d in line\n%s" 1689 % (expected[k], k, line)) 1690 1691 # Construct the answer tuple 1692 results = [] 1693 for c in cols_to_get: 1694 results.append(cols[c]) 1695 return tuple(results)
1696 1697
1698 -def _safe_int(str):
1699 try: 1700 return int(str) 1701 except ValueError: 1702 # Something went wrong. Try to clean up the string. 1703 # Remove all commas from the string 1704 str = str.replace(',', '') 1705 # try again after removing commas. 1706 # Note int() will return a long rather than overflow 1707 try: 1708 return int(str) 1709 except ValueError: 1710 pass 1711 # Call float to handle things like "54.3", note could lose precision, e.g. 1712 # >>> int("5399354557888517312") 1713 # 5399354557888517312 1714 # >>> int(float("5399354557888517312")) 1715 # 5399354557888517120 1716 return int(float(str))
1717 1718
1719 -def _safe_float(str):
1720 # Thomas Rosleff Soerensen (rosleff@mpiz-koeln.mpg.de) noted that 1721 # float('e-172') does not produce an error on his platform. Thus, 1722 # we need to check the string for this condition. 1723 1724 # Sometimes BLAST leaves of the '1' in front of an exponent. 1725 if str and str[0] in ['E', 'e']: 1726 str = '1' + str 1727 try: 1728 return float(str) 1729 except ValueError: 1730 # Remove all commas from the string 1731 str = str.replace(',', '') 1732 # try again. 1733 return float(str)
1734 1735
1736 -class _BlastErrorConsumer(_BlastConsumer):
1737 - def __init__(self):
1739
1740 - def noevent(self, line):
1741 if 'Query must be at least wordsize' in line: 1742 raise ShortQueryBlastError("Query must be at least wordsize") 1743 # Now pass the line back up to the superclass. 1744 method = getattr(_BlastConsumer, 'noevent', 1745 _BlastConsumer.__getattr__(self, 'noevent')) 1746 method(line)
1747 1748
1749 -class BlastErrorParser(AbstractParser):
1750 """Attempt to catch and diagnose BLAST errors while parsing. 1751 1752 This utilizes the BlastParser module but adds an additional layer 1753 of complexity on top of it by attempting to diagnose ValueErrors 1754 that may actually indicate problems during BLAST parsing. 1755 1756 Current BLAST problems this detects are: 1757 o LowQualityBlastError - When BLASTing really low quality sequences 1758 (ie. some GenBank entries which are just short streches of a single 1759 nucleotide), BLAST will report an error with the sequence and be 1760 unable to search with this. This will lead to a badly formatted 1761 BLAST report that the parsers choke on. The parser will convert the 1762 ValueError to a LowQualityBlastError and attempt to provide useful 1763 information. 1764 """
1765 - def __init__(self, bad_report_handle=None):
1766 """Initialize a parser that tries to catch BlastErrors. 1767 1768 Arguments: 1769 o bad_report_handle - An optional argument specifying a handle 1770 where bad reports should be sent. This would allow you to save 1771 all of the bad reports to a file, for instance. If no handle 1772 is specified, the bad reports will not be saved. 1773 """ 1774 self._bad_report_handle = bad_report_handle 1775 1776 # self._b_parser = BlastParser() 1777 self._scanner = _Scanner() 1778 self._consumer = _BlastErrorConsumer()
1779
1780 - def parse(self, handle):
1781 """Parse a handle, attempting to diagnose errors. 1782 """ 1783 results = handle.read() 1784 1785 try: 1786 self._scanner.feed(StringIO(results), self._consumer) 1787 except ValueError: 1788 # if we have a bad_report_file, save the info to it first 1789 if self._bad_report_handle: 1790 # send the info to the error handle 1791 self._bad_report_handle.write(