Package Bio :: Package Blast :: Module NCBIStandalone
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIStandalone

   1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # Patches by Mike Poidinger to support multiple databases. 
   6  # Updated by Peter Cock in 2007 to do a better job on BLAST 2.2.15 
   7   
   8  """Code for calling standalone BLAST and parsing plain text output (DEPRECATED). 
   9   
  10  Rather than parsing the human readable plain text BLAST output (which seems to 
  11  change with every update to BLAST), we and the NBCI recommend you parse the 
  12  XML output instead. The plain text parser in this module still works at the 
  13  time of writing, but is considered obsolete and updating it to cope with the 
  14  latest versions of BLAST is not a priority for us. 
  15   
  16  This module also provides code to work with the "legacy" standalone version of 
  17  NCBI BLAST, tools blastall, rpsblast and blastpgp via three helper functions of 
  18  the same name. These functions are very limited for dealing with the output as 
  19  files rather than handles, for which the wrappers in Bio.Blast.Applications are 
  20  preferred. Furthermore, the NCBI themselves regard these command line tools as 
  21  "legacy", and encourage using the new BLAST+ tools instead. Biopython has 
  22  wrappers for these under Bio.Blast.Applications (see the tutorial). 
  23   
  24  Classes: 
  25  LowQualityBlastError     Except that indicates low quality query sequences. 
  26  BlastParser              Parses output from blast. 
  27  BlastErrorParser         Parses output and tries to diagnose possible errors. 
  28  PSIBlastParser           Parses output from psi-blast. 
  29  Iterator                 Iterates over a file of blast results. 
  30   
  31  _Scanner                 Scans output from standalone BLAST. 
  32  _BlastConsumer           Consumes output from blast. 
  33  _PSIBlastConsumer        Consumes output from psi-blast. 
  34  _HeaderConsumer          Consumes header information. 
  35  _DescriptionConsumer     Consumes description information. 
  36  _AlignmentConsumer       Consumes alignment information. 
  37  _HSPConsumer             Consumes hsp information. 
  38  _DatabaseReportConsumer  Consumes database report information. 
  39  _ParametersConsumer      Consumes parameters information. 
  40   
  41  Functions: 
  42  blastall        Execute blastall. 
  43  blastpgp        Execute blastpgp. 
  44  rpsblast        Execute rpsblast. 
  45   
  46  For calling the BLAST command line tools, we encourage you to use the 
  47  command line wrappers in Bio.Blast.Applications - the three functions 
  48  blastall, blastpgp and rpsblast are considered to be obsolete now, and 
  49  are likely to be deprecated and then removed in future releases. 
  50  """ 
  51   
  52  from __future__ import print_function 
  53   
  54  from Bio import BiopythonDeprecationWarning 
  55  import warnings 
  56  warnings.warn("This module has been deprecated. Consider Bio.SearchIO for " 
  57                "parsing BLAST output instead.", BiopythonDeprecationWarning) 
  58   
  59  import os 
  60  import re 
  61  from Bio._py3k import StringIO 
  62   
  63  from Bio import File 
  64  from Bio.ParserSupport import * 
  65  from Bio.Blast import Record 
  66  from Bio.Application import _escape_filename 
  67   
  68   
69 -class LowQualityBlastError(Exception):
70 """Error caused by running a low quality sequence through BLAST. 71 72 When low quality sequences (like GenBank entries containing only 73 stretches of a single nucleotide) are BLASTed, they will result in 74 BLAST generating an error and not being able to perform the BLAST. 75 search. This error should be raised for the BLAST reports produced 76 in this case. 77 """ 78 pass
79 80
81 -class ShortQueryBlastError(Exception):
82 """Error caused by running a short query sequence through BLAST. 83 84 If the query sequence is too short, BLAST outputs warnings and errors: 85 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed. 86 [blastall] ERROR: [000.000] AT1G08320: Blast: 87 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize 88 done 89 90 This exception is raised when that condition is detected. 91 92 """ 93 pass
94 95
96 -class _Scanner(object):
97 """Scan BLAST output from blastall or blastpgp. 98 99 Tested with blastall and blastpgp v2.0.10, v2.0.11 100 101 Methods: 102 feed Feed data into the scanner. 103 """
104 - def feed(self, handle, consumer):
105 """S.feed(handle, consumer) 106 107 Feed in a BLAST report for scanning. handle is a file-like 108 object that contains the BLAST report. consumer is a Consumer 109 object that will receive events as the report is scanned. 110 111 """ 112 if isinstance(handle, File.UndoHandle): 113 uhandle = handle 114 else: 115 uhandle = File.UndoHandle(handle) 116 117 # Try to fast-forward to the beginning of the blast report. 118 read_and_call_until(uhandle, consumer.noevent, contains='BLAST') 119 # Now scan the BLAST report. 120 self._scan_header(uhandle, consumer) 121 self._scan_rounds(uhandle, consumer) 122 self._scan_database_report(uhandle, consumer) 123 self._scan_parameters(uhandle, consumer)
124
125 - def _scan_header(self, uhandle, consumer):
126 # BLASTP 2.0.10 [Aug-26-1999] 127 # 128 # 129 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaf 130 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), 131 # "Gapped BLAST and PSI-BLAST: a new generation of protein database sea 132 # programs", Nucleic Acids Res. 25:3389-3402. 133 # 134 # Query= test 135 # (140 letters) 136 # 137 # Database: sdqib40-1.35.seg.fa 138 # 1323 sequences; 223,339 total letters 139 # 140 # ======================================================== 141 # This next example is from the online version of Blast, 142 # note there are TWO references, an RID line, and also 143 # the database is BEFORE the query line. 144 # Note there possibleuse of non-ASCII in the author names. 145 # ======================================================== 146 # 147 # BLASTP 2.2.15 [Oct-15-2006] 148 # Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Sch??ffer, 149 # Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman 150 # (1997), "Gapped BLAST and PSI-BLAST: a new generation of 151 # protein database search programs", Nucleic Acids Res. 25:3389-3402. 152 # 153 # Reference: Sch??ffer, Alejandro A., L. Aravind, Thomas L. Madden, Sergei 154 # Shavirin, John L. Spouge, Yuri I. Wolf, Eugene V. Koonin, and 155 # Stephen F. Altschul (2001), "Improving the accuracy of PSI-BLAST 156 # protein database searches with composition-based statistics 157 # and other refinements", Nucleic Acids Res. 29:2994-3005. 158 # 159 # RID: 1166022616-19998-65316425856.BLASTQ1 160 # 161 # 162 # Database: All non-redundant GenBank CDS 163 # translations+PDB+SwissProt+PIR+PRF excluding environmental samples 164 # 4,254,166 sequences; 1,462,033,012 total letters 165 # Query= gi:16127998 166 # Length=428 167 # 168 169 consumer.start_header() 170 171 read_and_call(uhandle, consumer.version, contains='BLAST') 172 read_and_call_while(uhandle, consumer.noevent, blank=1) 173 174 # There might be a <pre> line, for qblast output. 175 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>") 176 177 # Read the reference(s) 178 while attempt_read_and_call(uhandle, 179 consumer.reference, start='Reference'): 180 # References are normally multiline terminated by a blank line 181 # (or, based on the old code, the RID line) 182 while True: 183 line = uhandle.readline() 184 if is_blank_line(line): 185 consumer.noevent(line) 186 break 187 elif line.startswith("RID"): 188 break 189 else: 190 #More of the reference 191 consumer.reference(line) 192 193 #Deal with the optional RID: ... 194 read_and_call_while(uhandle, consumer.noevent, blank=1) 195 attempt_read_and_call(uhandle, consumer.reference, start="RID:") 196 read_and_call_while(uhandle, consumer.noevent, blank=1) 197 198 # blastpgp may have a reference for compositional score matrix 199 # adjustment (see Bug 2502): 200 if attempt_read_and_call( 201 uhandle, consumer.reference, start="Reference"): 202 read_and_call_until(uhandle, consumer.reference, blank=1) 203 read_and_call_while(uhandle, consumer.noevent, blank=1) 204 205 # blastpgp has a Reference for composition-based statistics. 206 if attempt_read_and_call( 207 uhandle, consumer.reference, start="Reference"): 208 read_and_call_until(uhandle, consumer.reference, blank=1) 209 read_and_call_while(uhandle, consumer.noevent, blank=1) 210 211 line = uhandle.peekline() 212 assert line.strip() != "" 213 assert not line.startswith("RID:") 214 if line.startswith("Query="): 215 #This is an old style query then database... 216 217 # Read the Query lines and the following blank line. 218 read_and_call(uhandle, consumer.query_info, start='Query=') 219 read_and_call_until(uhandle, consumer.query_info, blank=1) 220 read_and_call_while(uhandle, consumer.noevent, blank=1) 221 222 # Read the database lines and the following blank line. 223 read_and_call_until(uhandle, consumer.database_info, end='total letters') 224 read_and_call(uhandle, consumer.database_info, contains='sequences') 225 read_and_call_while(uhandle, consumer.noevent, blank=1) 226 elif line.startswith("Database:"): 227 #This is a new style database then query... 228 read_and_call_until(uhandle, consumer.database_info, end='total letters') 229 read_and_call(uhandle, consumer.database_info, contains='sequences') 230 read_and_call_while(uhandle, consumer.noevent, blank=1) 231 232 # Read the Query lines and the following blank line. 233 # Or, on BLAST 2.2.22+ there is no blank link - need to spot 234 # the "... Score E" line instead. 235 read_and_call(uhandle, consumer.query_info, start='Query=') 236 # BLAST 2.2.25+ has a blank line before Length= 237 read_and_call_until(uhandle, consumer.query_info, start='Length=') 238 while True: 239 line = uhandle.peekline() 240 if not line.strip() or "Score E" in line: 241 break 242 #It is more of the query (and its length) 243 read_and_call(uhandle, consumer.query_info) 244 read_and_call_while(uhandle, consumer.noevent, blank=1) 245 else: 246 raise ValueError("Invalid header?") 247 248 consumer.end_header()
249
250 - def _scan_rounds(self, uhandle, consumer):
251 # Scan a bunch of rounds. 252 # Each round begins with either a "Searching......" line 253 # or a 'Score E' line followed by descriptions and alignments. 254 # The email server doesn't give the "Searching....." line. 255 # If there is no 'Searching.....' line then you'll first see a 256 # 'Results from round' line 257 258 while not self._eof(uhandle): 259 line = safe_peekline(uhandle) 260 if not line.startswith('Searching') and \ 261 not line.startswith('Results from round') and \ 262 re.search(r"Score +E", line) is None and \ 263 'No hits found' not in line: 264 break 265 self._scan_descriptions(uhandle, consumer) 266 self._scan_alignments(uhandle, consumer)
267
268 - def _scan_descriptions(self, uhandle, consumer):
269 # Searching..................................................done 270 # Results from round 2 271 # 272 # 273 # Sc 274 # Sequences producing significant alignments: (b 275 # Sequences used in model and found again: 276 # 277 # d1tde_2 3.4.1.4.4 (119-244) Thioredoxin reductase [Escherichia ... 278 # d1tcob_ 1.31.1.5.16 Calcineurin regulatory subunit (B-chain) [B... 279 # d1symb_ 1.31.1.2.2 Calcyclin (S100) [RAT (RATTUS NORVEGICUS)] 280 # 281 # Sequences not found previously or not previously below threshold: 282 # 283 # d1osa__ 1.31.1.5.11 Calmodulin [Paramecium tetraurelia] 284 # d1aoza3 2.5.1.3.3 (339-552) Ascorbate oxidase [zucchini (Cucurb... 285 # 286 287 # If PSI-BLAST, may also have: 288 # 289 # CONVERGED! 290 291 consumer.start_descriptions() 292 293 # Read 'Searching' 294 # This line seems to be missing in BLASTN 2.1.2 (others?) 295 attempt_read_and_call(uhandle, consumer.noevent, start='Searching') 296 297 # blastpgp 2.0.10 from NCBI 9/19/99 for Solaris sometimes crashes here. 298 # If this happens, the handle will yield no more information. 299 if not uhandle.peekline(): 300 raise ValueError("Unexpected end of blast report. " + 301 "Looks suspiciously like a PSI-BLAST crash.") 302 303 # BLASTN 2.2.3 sometimes spews a bunch of warnings and errors here: 304 # Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch 305 # [blastall] ERROR: [000.000] AT1G08320: Blast: 306 # [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at leas 307 # done 308 # Reported by David Weisman. 309 # Check for these error lines and ignore them for now. Let 310 # the BlastErrorParser deal with them. 311 line = uhandle.peekline() 312 if "ERROR:" in line or line.startswith("done"): 313 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:") 314 read_and_call(uhandle, consumer.noevent, start="done") 315 316 # Check to see if this is PSI-BLAST. 317 # If it is, the 'Searching' line will be followed by: 318 # (version 2.0.10) 319 # Searching............................. 320 # Results from round 2 321 # or (version 2.0.11) 322 # Searching............................. 323 # 324 # 325 # Results from round 2 326 327 # Skip a bunch of blank lines. 328 read_and_call_while(uhandle, consumer.noevent, blank=1) 329 # Check for the results line if it's there. 330 if attempt_read_and_call(uhandle, consumer.round, start='Results'): 331 read_and_call_while(uhandle, consumer.noevent, blank=1) 332 333 # Three things can happen here: 334 # 1. line contains 'Score E' 335 # 2. line contains "No hits found" 336 # 3. no descriptions 337 # The first one begins a bunch of descriptions. The last two 338 # indicates that no descriptions follow, and we should go straight 339 # to the alignments. 340 if not attempt_read_and_call( 341 uhandle, consumer.description_header, 342 has_re=re.compile(r'Score +E')): 343 # Either case 2 or 3. Look for "No hits found". 344 attempt_read_and_call(uhandle, consumer.no_hits, 345 contains='No hits found') 346 try: 347 read_and_call_while(uhandle, consumer.noevent, blank=1) 348 except ValueError as err: 349 if str(err) != "Unexpected end of stream.": 350 raise err 351 352 consumer.end_descriptions() 353 # Stop processing. 354 return 355 356 # Read the score header lines 357 read_and_call(uhandle, consumer.description_header, 358 start='Sequences producing') 359 360 # If PSI-BLAST, read the 'Sequences used in model' line. 361 attempt_read_and_call(uhandle, consumer.model_sequences, 362 start='Sequences used in model') 363 read_and_call_while(uhandle, consumer.noevent, blank=1) 364 365 # In BLAT, rather than a "No hits found" line, we just 366 # get no descriptions (and no alignments). This can be 367 # spotted because the next line is the database block: 368 if safe_peekline(uhandle).startswith(" Database:"): 369 consumer.end_descriptions() 370 # Stop processing. 371 return 372 373 # Read the descriptions and the following blank lines, making 374 # sure that there are descriptions. 375 if not uhandle.peekline().startswith('Sequences not found'): 376 read_and_call_until(uhandle, consumer.description, blank=1) 377 read_and_call_while(uhandle, consumer.noevent, blank=1) 378 379 # If PSI-BLAST, read the 'Sequences not found' line followed 380 # by more descriptions. However, I need to watch out for the 381 # case where there were no sequences not found previously, in 382 # which case there will be no more descriptions. 383 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences, 384 start='Sequences not found'): 385 # Read the descriptions and the following blank lines. 386 read_and_call_while(uhandle, consumer.noevent, blank=1) 387 l = safe_peekline(uhandle) 388 # Brad -- added check for QUERY. On some PSI-BLAST outputs 389 # there will be a 'Sequences not found' line followed by no 390 # descriptions. Check for this case since the first thing you'll 391 # get is a blank line and then 'QUERY' 392 if not l.startswith('CONVERGED') and l[0] != '>' \ 393 and not l.startswith('QUERY'): 394 read_and_call_until(uhandle, consumer.description, blank=1) 395 read_and_call_while(uhandle, consumer.noevent, blank=1) 396 397 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED') 398 read_and_call_while(uhandle, consumer.noevent, blank=1) 399 400 consumer.end_descriptions()
401
402 - def _scan_alignments(self, uhandle, consumer):
403 if self._eof(uhandle): 404 return 405 406 # qblast inserts a helpful line here. 407 attempt_read_and_call(uhandle, consumer.noevent, start="ALIGNMENTS") 408 409 # First, check to see if I'm at the database report. 410 line = safe_peekline(uhandle) 411 if not line: 412 #EOF 413 return 414 elif line.startswith(' Database') or line.startswith("Lambda"): 415 return 416 elif line[0] == '>': 417 # XXX make a better check here between pairwise and masterslave 418 self._scan_pairwise_alignments(uhandle, consumer) 419 elif line.startswith('Effective'): 420 return 421 else: 422 # XXX put in a check to make sure I'm in a masterslave alignment 423 self._scan_masterslave_alignment(uhandle, consumer)
424
425 - def _scan_pairwise_alignments(self, uhandle, consumer):
426 while not self._eof(uhandle): 427 line = safe_peekline(uhandle) 428 if line[0] != '>': 429 break 430 self._scan_one_pairwise_alignment(uhandle, consumer)
431
432 - def _scan_one_pairwise_alignment(self, uhandle, consumer):
433 if self._eof(uhandle): 434 return 435 consumer.start_alignment() 436 437 self._scan_alignment_header(uhandle, consumer) 438 439 # Scan a bunch of score/alignment pairs. 440 while True: 441 if self._eof(uhandle): 442 #Shouldn't have issued that _scan_alignment_header event... 443 break 444 line = safe_peekline(uhandle) 445 if not line.startswith(' Score'): 446 break 447 self._scan_hsp(uhandle, consumer) 448 consumer.end_alignment()
449
450 - def _scan_alignment_header(self, uhandle, consumer):
451 # >d1rip__ 2.24.7.1.1 Ribosomal S17 protein [Bacillus 452 # stearothermophilus] 453 # Length = 81 454 # 455 # Or, more recently with different white space: 456 # 457 # >gi|15799684|ref|NP_285696.1| threonine synthase ... 458 # gi|15829258|ref|NP_308031.1| threonine synthase 459 # ... 460 # Length=428 461 read_and_call(uhandle, consumer.title, start='>') 462 while True: 463 line = safe_readline(uhandle) 464 if line.lstrip().startswith('Length =') \ 465 or line.lstrip().startswith('Length='): 466 consumer.length(line) 467 break 468 elif is_blank_line(line): 469 # Check to make sure I haven't missed the Length line 470 raise ValueError("I missed the Length in an alignment header") 471 consumer.title(line) 472 473 # Older versions of BLAST will have a line with some spaces. 474 # Version 2.0.14 (maybe 2.0.13?) and above print a true blank line. 475 if not attempt_read_and_call(uhandle, consumer.noevent, 476 start=' '): 477 read_and_call(uhandle, consumer.noevent, blank=1)
478
479 - def _scan_hsp(self, uhandle, consumer):
480 consumer.start_hsp() 481 self._scan_hsp_header(uhandle, consumer) 482 self._scan_hsp_alignment(uhandle, consumer) 483 consumer.end_hsp()
484
485 - def _scan_hsp_header(self, uhandle, consumer):
486 # Score = 22.7 bits (47), Expect = 2.5 487 # Identities = 10/36 (27%), Positives = 18/36 (49%) 488 # Strand = Plus / Plus 489 # Frame = +3 490 # 491 492 read_and_call(uhandle, consumer.score, start=' Score') 493 read_and_call(uhandle, consumer.identities, start=' Identities') 494 # BLASTN 495 attempt_read_and_call(uhandle, consumer.strand, start = ' Strand') 496 # BLASTX, TBLASTN, TBLASTX 497 attempt_read_and_call(uhandle, consumer.frame, start = ' Frame') 498 read_and_call(uhandle, consumer.noevent, blank=1)
499
500 - def _scan_hsp_alignment(self, uhandle, consumer):
501 # Query: 11 GRGVSACA-------TCDGFFYRNQKVAVIGGGNTAVEEALYLSNIASEVHLIHRRDGF 502 # GRGVS+ TC Y + + V GGG+ + EE L + I R+ 503 # Sbjct: 12 GRGVSSVVRRCIHKPTCKE--YAVKIIDVTGGGSFSAEEVQELREATLKEVDILRKVSG 504 # 505 # Query: 64 AEKILIKR 71 506 # I +K 507 # Sbjct: 70 PNIIQLKD 77 508 # 509 510 while True: 511 # Blastn adds an extra line filled with spaces before Query 512 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 513 read_and_call(uhandle, consumer.query, start='Query') 514 read_and_call(uhandle, consumer.align, start=' ') 515 read_and_call(uhandle, consumer.sbjct, start='Sbjct') 516 try: 517 read_and_call_while(uhandle, consumer.noevent, blank=1) 518 except ValueError as err: 519 if str(err) != "Unexpected end of stream.": 520 raise err 521 # End of File (well, it looks like it with recent versions 522 # of BLAST for multiple queries after the Iterator class 523 # has broken up the whole file into chunks). 524 break 525 line = safe_peekline(uhandle) 526 # Alignment continues if I see a 'Query' or the spaces for Blastn. 527 if not (line.startswith('Query') or line.startswith(' ')): 528 break
529
530 - def _scan_masterslave_alignment(self, uhandle, consumer):
531 consumer.start_alignment() 532 while True: 533 line = safe_readline(uhandle) 534 # Check to see whether I'm finished reading the alignment. 535 # This is indicated by 1) database section, 2) next psi-blast 536 # round, which can also be a 'Results from round' if no 537 # searching line is present 538 # patch by chapmanb 539 if line.startswith('Searching') or \ 540 line.startswith('Results from round'): 541 uhandle.saveline(line) 542 break 543 elif line.startswith(' Database'): 544 uhandle.saveline(line) 545 break 546 elif is_blank_line(line): 547 consumer.noevent(line) 548 else: 549 consumer.multalign(line) 550 read_and_call_while(uhandle, consumer.noevent, blank=1) 551 consumer.end_alignment()
552
553 - def _eof(self, uhandle):
554 try: 555 line = safe_peekline(uhandle) 556 except ValueError as err: 557 if str(err) != "Unexpected end of stream.": 558 raise err 559 line = "" 560 return not line
561
562 - def _scan_database_report(self, uhandle, consumer):
563 # Database: sdqib40-1.35.seg.fa 564 # Posted date: Nov 1, 1999 4:25 PM 565 # Number of letters in database: 223,339 566 # Number of sequences in database: 1323 567 # 568 # Lambda K H 569 # 0.322 0.133 0.369 570 # 571 # Gapped 572 # Lambda K H 573 # 0.270 0.0470 0.230 574 # 575 ########################################## 576 # Or, more recently Blast 2.2.15 gives less blank lines 577 ########################################## 578 # Database: All non-redundant GenBank CDS translations+PDB+SwissProt+PIR+PRF excluding 579 # environmental samples 580 # Posted date: Dec 12, 2006 5:51 PM 581 # Number of letters in database: 667,088,753 582 # Number of sequences in database: 2,094,974 583 # Lambda K H 584 # 0.319 0.136 0.395 585 # Gapped 586 # Lambda K H 587 # 0.267 0.0410 0.140 588 589 if self._eof(uhandle): 590 return 591 592 consumer.start_database_report() 593 594 # Subset of the database(s) listed below 595 # Number of letters searched: 562,618,960 596 # Number of sequences searched: 228,924 597 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"): 598 read_and_call(uhandle, consumer.noevent, contains="letters") 599 read_and_call(uhandle, consumer.noevent, contains="sequences") 600 read_and_call(uhandle, consumer.noevent, start=" ") 601 602 # Sameet Mehta reported seeing output from BLASTN 2.2.9 that 603 # was missing the "Database" stanza completely. 604 while attempt_read_and_call(uhandle, consumer.database, 605 start=' Database'): 606 # BLAT output ends abruptly here, without any of the other 607 # information. Check to see if this is the case. If so, 608 # then end the database report here gracefully. 609 if not uhandle.peekline().strip() \ 610 or uhandle.peekline().startswith("BLAST"): 611 consumer.end_database_report() 612 return 613 614 # Database can span multiple lines. 615 read_and_call_until(uhandle, consumer.database, start=' Posted') 616 read_and_call(uhandle, consumer.posted_date, start=' Posted') 617 read_and_call(uhandle, consumer.num_letters_in_database, 618 start=' Number of letters') 619 read_and_call(uhandle, consumer.num_sequences_in_database, 620 start=' Number of sequences') 621 #There may not be a line starting with spaces... 622 attempt_read_and_call(uhandle, consumer.noevent, start=' ') 623 624 line = safe_readline(uhandle) 625 uhandle.saveline(line) 626 if 'Lambda' in line: 627 break 628 629 try: 630 read_and_call(uhandle, consumer.noevent, start='Lambda') 631 read_and_call(uhandle, consumer.ka_params) 632 except: 633 pass 634 635 #This blank line is optional: 636 attempt_read_and_call(uhandle, consumer.noevent, blank=1) 637 638 # not BLASTP 639 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped') 640 # not TBLASTX 641 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'): 642 read_and_call(uhandle, consumer.ka_params_gap) 643 644 # Blast 2.2.4 can sometimes skip the whole parameter section. 645 # Thus, I need to be careful not to read past the end of the 646 # file. 647 try: 648 read_and_call_while(uhandle, consumer.noevent, blank=1) 649 except ValueError as x: 650 if str(x) != "Unexpected end of stream.": 651 raise 652 consumer.end_database_report()
653
654 - def _scan_parameters(self, uhandle, consumer):
655 # Matrix: BLOSUM62 656 # Gap Penalties: Existence: 11, Extension: 1 657 # Number of Hits to DB: 50604 658 # Number of Sequences: 1323 659 # Number of extensions: 1526 660 # Number of successful extensions: 6 661 # Number of sequences better than 10.0: 5 662 # Number of HSP's better than 10.0 without gapping: 5 663 # Number of HSP's successfully gapped in prelim test: 0 664 # Number of HSP's that attempted gapping in prelim test: 1 665 # Number of HSP's gapped (non-prelim): 5 666 # length of query: 140 667 # length of database: 223,339 668 # effective HSP length: 39 669 # effective length of query: 101 670 # effective length of database: 171,742 671 # effective search space: 17345942 672 # effective search space used: 17345942 673 # T: 11 674 # A: 40 675 # X1: 16 ( 7.4 bits) 676 # X2: 38 (14.8 bits) 677 # X3: 64 (24.9 bits) 678 # S1: 41 (21.9 bits) 679 # S2: 42 (20.8 bits) 680 ########################################## 681 # Or, more recently Blast(x) 2.2.15 gives 682 ########################################## 683 # Matrix: BLOSUM62 684 # Gap Penalties: Existence: 11, Extension: 1 685 # Number of Sequences: 4535438 686 # Number of Hits to DB: 2,588,844,100 687 # Number of extensions: 60427286 688 # Number of successful extensions: 126433 689 # Number of sequences better than 2.0: 30 690 # Number of HSP's gapped: 126387 691 # Number of HSP's successfully gapped: 35 692 # Length of query: 291 693 # Length of database: 1,573,298,872 694 # Length adjustment: 130 695 # Effective length of query: 161 696 # Effective length of database: 983,691,932 697 # Effective search space: 158374401052 698 # Effective search space used: 158374401052 699 # Neighboring words threshold: 12 700 # Window for multiple hits: 40 701 # X1: 16 ( 7.3 bits) 702 # X2: 38 (14.6 bits) 703 # X3: 64 (24.7 bits) 704 # S1: 41 (21.7 bits) 705 # S2: 32 (16.9 bits) 706 707 # Blast 2.2.4 can sometimes skip the whole parameter section. 708 # BLAT also skips the whole parameter section. 709 # Thus, check to make sure that the parameter section really 710 # exists. 711 if not uhandle.peekline().strip(): 712 return 713 714 # BLASTN 2.2.9 looks like it reverses the "Number of Hits" and 715 # "Number of Sequences" lines. 716 consumer.start_parameters() 717 718 # Matrix line may be missing in BLASTN 2.2.9 719 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix') 720 # not TBLASTX 721 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap') 722 723 attempt_read_and_call(uhandle, consumer.num_sequences, 724 start='Number of Sequences') 725 attempt_read_and_call(uhandle, consumer.num_hits, 726 start='Number of Hits') 727 attempt_read_and_call(uhandle, consumer.num_sequences, 728 start='Number of Sequences') 729 attempt_read_and_call(uhandle, consumer.num_extends, 730 start='Number of extensions') 731 attempt_read_and_call(uhandle, consumer.num_good_extends, 732 start='Number of successful') 733 734 attempt_read_and_call(uhandle, consumer.num_seqs_better_e, 735 start='Number of sequences') 736 737 # not BLASTN, TBLASTX 738 if attempt_read_and_call(uhandle, consumer.hsps_no_gap, 739 start="Number of HSP's better"): 740 # BLASTN 2.2.9 741 if attempt_read_and_call(uhandle, consumer.noevent, 742 start="Number of HSP's gapped:"): 743 read_and_call(uhandle, consumer.noevent, 744 start="Number of HSP's successfully") 745 #This is omitted in 2.2.15 746 attempt_read_and_call(uhandle, consumer.noevent, 747 start="Number of extra gapped extensions") 748 else: 749 read_and_call(uhandle, consumer.hsps_prelim_gapped, 750 start="Number of HSP's successfully") 751 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted, 752 start="Number of HSP's that") 753 read_and_call(uhandle, consumer.hsps_gapped, 754 start="Number of HSP's gapped") 755 #e.g. BLASTX 2.2.15 where the "better" line is missing 756 elif attempt_read_and_call(uhandle, consumer.noevent, 757 start="Number of HSP's gapped"): 758 read_and_call(uhandle, consumer.noevent, 759 start="Number of HSP's successfully") 760 761 # not in blastx 2.2.1 762 attempt_read_and_call(uhandle, consumer.query_length, 763 has_re=re.compile(r"[Ll]ength of query")) 764 # Not in BLASTX 2.2.22+ 765 attempt_read_and_call(uhandle, consumer.database_length, 766 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase")) 767 768 # BLASTN 2.2.9 769 attempt_read_and_call(uhandle, consumer.noevent, 770 start="Length adjustment") 771 attempt_read_and_call(uhandle, consumer.effective_hsp_length, 772 start='effective HSP') 773 # Not in blastx 2.2.1 774 attempt_read_and_call( 775 uhandle, consumer.effective_query_length, 776 has_re=re.compile(r'[Ee]ffective length of query')) 777 778 # This is not in BLASTP 2.2.15 779 attempt_read_and_call( 780 uhandle, consumer.effective_database_length, 781 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase')) 782 # Not in blastx 2.2.1, added a ':' to distinguish between 783 # this and the 'effective search space used' line 784 attempt_read_and_call( 785 uhandle, consumer.effective_search_space, 786 has_re=re.compile(r'[Ee]ffective search space:')) 787 # Does not appear in BLASTP 2.0.5 788 attempt_read_and_call( 789 uhandle, consumer.effective_search_space_used, 790 has_re=re.compile(r'[Ee]ffective search space used')) 791 792 # BLASTX, TBLASTN, TBLASTX 793 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift') 794 795 # not in BLASTN 2.2.9 796 attempt_read_and_call(uhandle, consumer.threshold, start='T') 797 # In BLASTX 2.2.15 replaced by: "Neighboring words threshold: 12" 798 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring words threshold') 799 800 # not in BLASTX 2.2.15 801 attempt_read_and_call(uhandle, consumer.window_size, start='A') 802 # get this instead: "Window for multiple hits: 40" 803 attempt_read_and_call(uhandle, consumer.window_size, start='Window for multiple hits') 804 805 # not in BLASTX 2.2.22+ 806 attempt_read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1') 807 # not TBLASTN 808 attempt_read_and_call(uhandle, consumer.gap_x_dropoff, start='X2') 809 810 # not BLASTN, TBLASTX 811 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final, 812 start='X3') 813 814 # not TBLASTN 815 attempt_read_and_call(uhandle, consumer.gap_trigger, start='S1') 816 # not in blastx 2.2.1 817 # first we make sure we have additional lines to work with, if 818 # not then the file is done and we don't have a final S2 819 if not is_blank_line(uhandle.peekline(), allow_spaces=1): 820 read_and_call(uhandle, consumer.blast_cutoff, start='S2') 821 822 consumer.end_parameters()
823 824
825 -class BlastParser(AbstractParser):
826 """Parses BLAST data into a Record.Blast object. 827 828 """
829 - def __init__(self):
830 """__init__(self)""" 831 self._scanner = _Scanner() 832 self._consumer = _BlastConsumer()
833
834 - def parse(self, handle):
835 """parse(self, handle)""" 836 self._scanner.feed(handle, self._consumer) 837 return self._consumer.data
838 839
840 -class PSIBlastParser(AbstractParser):
841 """Parses BLAST data into a Record.PSIBlast object. 842 843 """
844 - def __init__(self):
845 """__init__(self)""" 846 self._scanner = _Scanner() 847 self._consumer = _PSIBlastConsumer()
848
849 - def parse(self, handle):
850 """parse(self, handle)""" 851 self._scanner.feed(handle, self._consumer) 852 return self._consumer.data
853 854
855 -class _HeaderConsumer(object):
856 - def start_header(self):
857 self._header = Record.Header()
858
859 - def version(self, line):
860 c = line.split() 861 self._header.application = c[0] 862 self._header.version = c[1] 863 if len(c) > 2: 864 #The date is missing in the new C++ output from blastx 2.2.22+ 865 #Just get "BLASTX 2.2.22+\n" and that's all. 866 self._header.date = c[2][1:-1]
867
868 - def reference(self, line):
869 if line.startswith('Reference: '): 870 self._header.reference = line[11:] 871 else: 872 self._header.reference = self._header.reference + line
873
874 - def query_info(self, line):
875 if line.startswith('Query= '): 876 self._header.query = line[7:].lstrip() 877 elif line.startswith('Length='): 878 #New style way to give the query length in BLAST 2.2.22+ (the C++ code) 879 self._header.query_letters = _safe_int(line[7:].strip()) 880 elif not line.startswith(' '): # continuation of query_info 881 self._header.query = "%s%s" % (self._header.query, line) 882 else: 883 #Hope it is the old style way to give the query length: 884 letters, = _re_search( 885 r"([0-9,]+) letters", line, 886 "I could not find the number of letters in line\n%s" % line) 887 self._header.query_letters = _safe_int(letters)
888
889 - def database_info(self, line):
890 line = line.rstrip() 891 if line.startswith('Database: '): 892 self._header.database = line[10:] 893 elif not line.endswith('total letters'): 894 if self._header.database: 895 #Need to include a space when merging multi line datase descr 896 self._header.database = self._header.database + " " + line.strip() 897 else: 898 self._header.database = line.strip() 899 else: 900 sequences, letters =_re_search( 901 r"([0-9,]+) sequences; ([0-9,-]+) total letters", line, 902 "I could not find the sequences and letters in line\n%s" %line) 903 self._header.database_sequences = _safe_int(sequences) 904 self._header.database_letters = _safe_int(letters)
905
906 - def end_header(self):
907 # Get rid of the trailing newlines 908 self._header.reference = self._header.reference.rstrip() 909 self._header.query = self._header.query.rstrip()
910 911
912 -class _DescriptionConsumer(object):
913 - def start_descriptions(self):
914 self._descriptions = [] 915 self._model_sequences = [] 916 self._nonmodel_sequences = [] 917 self._converged = 0 918 self._type = None 919 self._roundnum = None 920 921 self.__has_n = 0 # Does the description line contain an N value?
922
923 - def description_header(self, line):
924 if line.startswith('Sequences producing'): 925 cols = line.split() 926 if cols[-1] == 'N': 927 self.__has_n = 1
928
929 - def description(self, line):
930 dh = self._parse(line) 931 if self._type == 'model': 932 self._model_sequences.append(dh) 933 elif self._type == 'nonmodel': 934 self._nonmodel_sequences.append(dh) 935 else: 936 self._descriptions.append(dh)
937
938 - def model_sequences(self, line):
939 self._type = 'model'
940
941 - def nonmodel_sequences(self, line):
942 self._type = 'nonmodel'
943
944 - def converged(self, line):
945 self._converged = 1
946
947 - def no_hits(self, line):
948 pass
949
950 - def round(self, line):
951 if not line.startswith('Results from round'): 952 raise ValueError("I didn't understand the round line\n%s" % line) 953 self._roundnum = _safe_int(line[18:].strip())
954
955 - def end_descriptions(self):
956 pass
957
958 - def _parse(self, description_line):
959 line = description_line # for convenience 960 dh = Record.Description() 961 962 # I need to separate the score and p-value from the title. 963 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 964 # sp|P21297|FLBT_CAUCR FLBT PROTEIN [snip] 284 7e-77 1 965 # special cases to handle: 966 # - title must be preserved exactly (including whitespaces) 967 # - score could be equal to e-value (not likely, but what if??) 968 # - sometimes there's an "N" score of '1'. 969 cols = line.split() 970 if len(cols) < 3: 971 raise ValueError( 972 "Line does not appear to contain description:\n%s" % line) 973 if self.__has_n: 974 i = line.rfind(cols[-1]) # find start of N 975 i = line.rfind(cols[-2], 0, i) # find start of p-value 976 i = line.rfind(cols[-3], 0, i) # find start of score 977 else: 978 i = line.rfind(cols[-1]) # find start of p-value 979 i = line.rfind(cols[-2], 0, i) # find start of score 980 if self.__has_n: 981 dh.title, dh.score, dh.e, dh.num_alignments = \ 982 line[:i].rstrip(), cols[-3], cols[-2], cols[-1] 983 else: 984 dh.title, dh.score, dh.e, dh.num_alignments = \ 985 line[:i].rstrip(), cols[-2], cols[-1], 1 986 dh.num_alignments = _safe_int(dh.num_alignments) 987 dh.score = _safe_int(dh.score) 988 dh.e = _safe_float(dh.e) 989 return dh
990 991
992 -class _AlignmentConsumer(object):
993 # This is a little bit tricky. An alignment can either be a 994 # pairwise alignment or a multiple alignment. Since it's difficult 995 # to know a-priori which one the blast record will contain, I'm going 996 # to make one class that can parse both of them.
997 - def start_alignment(self):
998 self._alignment = Record.Alignment() 999 self._multiple_alignment = Record.MultipleAlignment()
1000
1001 - def title(self, line):
1002 if self._alignment.title: 1003 self._alignment.title += " " 1004 self._alignment.title += line.strip()
1005
1006 - def length(self, line):
1007 #e.g. "Length = 81" or more recently, "Length=428" 1008 parts = line.replace(" ", "").split("=") 1009 assert len(parts)==2, "Unrecognised format length line" 1010 self._alignment.length = parts[1] 1011 self._alignment.length = _safe_int(self._alignment.length)
1012
1013 - def multalign(self, line):
1014 # Standalone version uses 'QUERY', while WWW version uses blast_tmp. 1015 if line.startswith('QUERY') or line.startswith('blast_tmp'): 1016 # If this is the first line of the multiple alignment, 1017 # then I need to figure out how the line is formatted. 1018 1019 # Format of line is: 1020 # QUERY 1 acttg...gccagaggtggtttattcagtctccataagagaggggacaaacg 60 1021 try: 1022 name, start, seq, end = line.split() 1023 except ValueError: 1024 raise ValueError("I do not understand the line\n%s" % line) 1025 self._start_index = line.index(start, len(name)) 1026 self._seq_index = line.index(seq, 1027 self._start_index+len(start)) 1028 # subtract 1 for the space 1029 self._name_length = self._start_index - 1 1030 self._start_length = self._seq_index - self._start_index - 1 1031 self._seq_length = line.rfind(end) - self._seq_index - 1 1032 1033 #self._seq_index = line.index(seq) 1034 ## subtract 1 for the space 1035 #self._seq_length = line.rfind(end) - self._seq_index - 1 1036 #self._start_index = line.index(start) 1037 #self._start_length = self._seq_index - self._start_index - 1 1038 #self._name_length = self._start_index 1039 1040 # Extract the information from the line 1041 name = line[:self._name_length] 1042 name = name.rstrip() 1043 start = line[self._start_index:self._start_index+self._start_length] 1044 start = start.rstrip() 1045 if start: 1046 start = _safe_int(start) 1047 end = line[self._seq_index+self._seq_length:].rstrip() 1048 if end: 1049 end = _safe_int(end) 1050 seq = line[self._seq_index:self._seq_index+self._seq_length].rstrip() 1051 # right pad the sequence with spaces if necessary 1052 if len(seq) < self._seq_length: 1053 seq = seq + ' '*(self._seq_length-len(seq)) 1054 1055 # I need to make sure the sequence is aligned correctly with the query. 1056 # First, I will find the length of the query. Then, if necessary, 1057 # I will pad my current sequence with spaces so that they will line 1058 # up correctly. 1059 1060 # Two possible things can happen: 1061 # QUERY 1062 # 504 1063 # 1064 # QUERY 1065 # 403 1066 # 1067 # Sequence 504 will need padding at the end. Since I won't know 1068 # this until the end of the alignment, this will be handled in 1069 # end_alignment. 1070 # Sequence 403 will need padding before being added to the alignment. 1071 1072 align = self._multiple_alignment.alignment # for convenience 1073 align.append((name, start, seq, end))
1074 1075 # This is old code that tried to line up all the sequences 1076 # in a multiple alignment by using the sequence title's as 1077 # identifiers. The problem with this is that BLAST assigns 1078 # different HSP's from the same sequence the same id. Thus, 1079 # in one alignment block, there may be multiple sequences with 1080 # the same id. I'm not sure how to handle this, so I'm not 1081 # going to. 1082 1083 # # If the sequence is the query, then just add it. 1084 # if name == 'QUERY': 1085 # if len(align) == 0: 1086 # align.append((name, start, seq)) 1087 # else: 1088 # aname, astart, aseq = align[0] 1089 # if name != aname: 1090 # raise ValueError, "Query is not the first sequence" 1091 # aseq = aseq + seq 1092 # align[0] = aname, astart, aseq 1093 # else: 1094 # if len(align) == 0: 1095 # raise ValueError, "I could not find the query sequence" 1096 # qname, qstart, qseq = align[0] 1097 # 1098 # # Now find my sequence in the multiple alignment. 1099 # for i in range(1, len(align)): 1100 # aname, astart, aseq = align[i] 1101 # if name == aname: 1102 # index = i 1103 # break 1104 # else: 1105 # # If I couldn't find it, then add a new one. 1106 # align.append((None, None, None)) 1107 # index = len(align)-1 1108 # # Make sure to left-pad it. 1109 # aname, astart, aseq = name, start, ' '*(len(qseq)-len(seq)) 1110 # 1111 # if len(qseq) != len(aseq) + len(seq): 1112 # # If my sequences are shorter than the query sequence, 1113 # # then I will need to pad some spaces to make them line up. 1114 # # Since I've already right padded seq, that means aseq 1115 # # must be too short. 1116 # aseq = aseq + ' '*(len(qseq)-len(aseq)-len(seq)) 1117 # aseq = aseq + seq 1118 # if astart is None: 1119 # astart = start 1120 # align[index] = aname, astart, aseq 1121
1122 - def end_alignment(self):
1123 # Remove trailing newlines 1124 if self._alignment: 1125 self._alignment.title = self._alignment.title.rstrip() 1126 1127 # This code is also obsolete. See note above. 1128 # If there's a multiple alignment, I will need to make sure 1129 # all the sequences are aligned. That is, I may need to 1130 # right-pad the sequences. 1131 # if self._multiple_alignment is not None: 1132 # align = self._multiple_alignment.alignment 1133 # seqlen = None 1134 # for i in range(len(align)): 1135 # name, start, seq = align[i] 1136 # if seqlen is None: 1137 # seqlen = len(seq) 1138 # else: 1139 # if len(seq) < seqlen: 1140 # seq = seq + ' '*(seqlen - len(seq)) 1141 # align[i] = name, start, seq 1142 # elif len(seq) > seqlen: 1143 # raise ValueError, \ 1144 # "Sequence %s is longer than the query" % name 1145 1146 # Clean up some variables, if they exist. 1147 try: 1148 del self._seq_index 1149 del self._seq_length 1150 del self._start_index 1151 del self._start_length 1152 del self._name_length 1153 except AttributeError: 1154 pass
1155 1156
1157 -class _HSPConsumer(object):
1158 - def start_hsp(self):
1159 self._hsp = Record.HSP()
1160
1161 - def score(self, line):
1162 self._hsp.bits, self._hsp.score = _re_search( 1163 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line, 1164 "I could not find the score in line\n%s" % line) 1165 self._hsp.score = _safe_float(self._hsp.score) 1166 self._hsp.bits = _safe_float(self._hsp.bits) 1167 1168 x, y = _re_search( 1169 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line, 1170 "I could not find the expect in line\n%s" % line) 1171 if x: 1172 self._hsp.num_alignments = _safe_int(x) 1173 else: 1174 self._hsp.num_alignments = 1 1175 self._hsp.expect = _safe_float(y)
1176
1177 - def identities(self, line):
1178 x, y = _re_search( 1179 r"Identities = (\d+)\/(\d+)", line, 1180 "I could not find the identities in line\n%s" % line) 1181 self._hsp.identities = _safe_int(x), _safe_int(y) 1182 self._hsp.align_length = _safe_int(y) 1183 1184 if 'Positives' in line: 1185 x, y = _re_search( 1186 r"Positives = (\d+)\/(\d+)", line, 1187 "I could not find the positives in line\n%s" % line) 1188 self._hsp.positives = _safe_int(x), _safe_int(y) 1189 assert self._hsp.align_length == _safe_int(y) 1190 1191 if 'Gaps' in line: 1192 x, y = _re_search( 1193 r"Gaps = (\d+)\/(\d+)", line, 1194 "I could not find the gaps in line\n%s" % line) 1195 self._hsp.gaps = _safe_int(x), _safe_int(y) 1196 assert self._hsp.align_length == _safe_int(y)
1197
1198 - def strand(self, line):
1199 self._hsp.strand = _re_search( 1200 r"Strand\s?=\s?(\w+)\s?/\s?(\w+)", line, 1201 "I could not find the strand in line\n%s" % line)
1202
1203 - def frame(self, line):
1204 # Frame can be in formats: 1205 # Frame = +1 1206 # Frame = +2 / +2 1207 if '/' in line: 1208 self._hsp.frame = _re_search( 1209 r"Frame\s?=\s?([-+][123])\s?/\s?([-+][123])", line, 1210 "I could not find the frame in line\n%s" % line) 1211 else: 1212 self._hsp.frame = _re_search( 1213 r"Frame = ([-+][123])", line, 1214 "I could not find the frame in line\n%s" % line)
1215 1216 # Match a space, if one is available. Masahir Ishikawa found a 1217 # case where there's no space between the start and the sequence: 1218 # Query: 100tt 101 1219 # line below modified by Yair Benita, Sep 2004 1220 # Note that the colon is not always present. 2006 1221 _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)") 1222
1223 - def query(self, line):
1224 m = self._query_re.search(line) 1225 if m is None: 1226 raise ValueError("I could not find the query in line\n%s" % line) 1227 1228 # line below modified by Yair Benita, Sep 2004. 1229 # added the end attribute for the query 1230 colon, start, seq, end = m.groups() 1231 self._hsp.query = self._hsp.query + seq 1232 if self._hsp.query_start is None: 1233 self._hsp.query_start = _safe_int(start) 1234 1235 # line below added by Yair Benita, Sep 2004. 1236 # added the end attribute for the query 1237 self._hsp.query_end = _safe_int(end) 1238 1239 #Get index for sequence start (regular expression element 3) 1240 self._query_start_index = m.start(3) 1241 self._query_len = len(seq)
1242
1243 - def align(self, line):
1244 seq = line[self._query_start_index:].rstrip() 1245 if len(seq) < self._query_len: 1246 # Make sure the alignment is the same length as the query 1247 seq = seq + ' ' * (self._query_len-len(seq)) 1248 elif len(seq) < self._query_len: 1249 raise ValueError("Match is longer than the query in line\n%s" 1250 % line) 1251 self._hsp.match = self._hsp.match + seq
1252 1253 # To match how we do the query, cache the regular expression. 1254 # Note that the colon is not always present. 1255 _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)") 1256
1257 - def sbjct(self, line):
1258 m = self._sbjct_re.search(line) 1259 if m is None: 1260 raise ValueError("I could not find the sbjct in line\n%s" % line) 1261 colon, start, seq, end = m.groups() 1262 #mikep 26/9/00 1263 #On occasion, there is a blast hit with no subject match 1264 #so far, it only occurs with 1-line short "matches" 1265 #I have decided to let these pass as they appear 1266 if not seq.strip(): 1267 seq = ' ' * self._query_len 1268 self._hsp.sbjct = self._hsp.sbjct + seq 1269 if self._hsp.sbjct_start is None: 1270 self._hsp.sbjct_start = _safe_int(start) 1271 1272 self._hsp.sbjct_end = _safe_int(end) 1273 if len(seq) != self._query_len: 1274 raise ValueError( 1275 "QUERY and SBJCT sequence lengths don't match in line\n%s" 1276 % line) 1277 1278 del self._query_start_index # clean up unused variables 1279 del self._query_len
1280
1281 - def end_hsp(self):
1282 pass
1283 1284
1285 -class _DatabaseReportConsumer(object):
1286
1287 - def start_database_report(self):
1288 self._dr = Record.DatabaseReport()
1289
1290 - def database(self, line):
1291 m = re.search(r"Database: (.+)$", line) 1292 if m: 1293 self._dr.database_name.append(m.group(1)) 1294 elif self._dr.database_name: 1295 # This must be a continuation of the previous name. 1296 self._dr.database_name[-1] = "%s%s" % (self._dr.database_name[-1], 1297 line.strip())
1298
1299 - def posted_date(self, line):
1300 self._dr.posted_date.append(_re_search( 1301 r"Posted date:\s*(.+)$", line, 1302 "I could not find the posted date in line\n%s" % line))
1303
1304 - def num_letters_in_database(self, line):
1305 letters, = _get_cols( 1306 line, (-1,), ncols=6, expected={2:"letters", 4:"database:"}) 1307 self._dr.num_letters_in_database.append(_safe_int(letters))
1308
1309 - def num_sequences_in_database(self, line):
1310 sequences, = _get_cols( 1311 line, (-1,), ncols=6, expected={2:"sequences", 4:"database:"}) 1312 self._dr.num_sequences_in_database.append(_safe_int(sequences))
1313
1314 - def ka_params(self, line):
1315 self._dr.ka_params = [_safe_float(x) for x in line.split()]
1316
1317 - def gapped(self, line):
1318 self._dr.gapped = 1
1319
1320 - def ka_params_gap(self, line):
1321 self._dr.ka_params_gap = [_safe_float(x) for x in line.split()]
1322
1323 - def end_database_report(self):
1324 pass
1325 1326
1327 -class _ParametersConsumer(object):
1328 - def start_parameters(self):
1329 self._params = Record.Parameters()
1330
1331 - def matrix(self, line):
1332 self._params.matrix = line[8:].rstrip()
1333
1334 - def gap_penalties(self, line):
1335 self._params.gap_penalties = [_safe_float(x) for x in _get_cols( 1336 line, (3, 5), ncols=6, expected={2:"Existence:", 4:"Extension:"})]
1337
1338 - def num_hits(self, line):
1339 if '1st pass' in line: 1340 x, = _get_cols(line, (-4,), ncols=11, expected={2:"Hits"}) 1341 self._params.num_hits = _safe_int(x) 1342 else: 1343 x, = _get_cols(line, (-1,), ncols=6, expected={2:"Hits"}) 1344 self._params.num_hits = _safe_int(x)
1345
1346 - def num_sequences(self, line):
1347 if '1st pass' in line: 1348 x, = _get_cols(line, (-4,), ncols=9, expected={2:"Sequences:"}) 1349 self._params.num_sequences = _safe_int(x) 1350 else: 1351 x, = _get_cols(line, (-1,), ncols=4, expected={2:"Sequences:"}) 1352 self._params.num_sequences = _safe_int(x)
1353
1354 - def num_extends(self, line):
1355 if '1st pass' in line: 1356 x, = _get_cols(line, (-4,), ncols=9, expected={2:"extensions:"}) 1357 self._params.num_extends = _safe_int(x) 1358 else: 1359 x, = _get_cols(line, (-1,), ncols=4, expected={2:"extensions:"}) 1360 self._params.num_extends = _safe_int(x)
1361
1362 - def num_good_extends(self, line):
1363 if '1st pass' in line: 1364 x, = _get_cols(line, (-4,), ncols=10, expected={3:"extensions:"}) 1365 self._params.num_good_extends = _safe_int(x) 1366 else: 1367 x, = _get_cols(line, (-1,), ncols=5, expected={3:"extensions:"}) 1368 self._params.num_good_extends = _safe_int(x)
1369
1370 - def num_seqs_better_e(self, line):
1371 self._params.num_seqs_better_e, = _get_cols( 1372 line, (-1,), ncols=7, expected={2:"sequences"}) 1373 self._params.num_seqs_better_e = _safe_int( 1374 self._params.num_seqs_better_e)
1375
1376 - def hsps_no_gap(self, line):
1377 self._params.hsps_no_gap, = _get_cols( 1378 line, (-1,), ncols=9, expected={3:"better", 7:"gapping:"}) 1379 self._params.hsps_no_gap = _safe_int(self._params.hsps_no_gap)
1380
1381 - def hsps_prelim_gapped(self, line):
1382 self._params.hsps_prelim_gapped, = _get_cols( 1383 line, (-1,), ncols=9, expected={4:"gapped", 6:"prelim"}) 1384 self._params.hsps_prelim_gapped = _safe_int( 1385 self._params.hsps_prelim_gapped)
1386
1387 - def hsps_prelim_gapped_attempted(self, line):
1388 self._params.hsps_prelim_gapped_attempted, = _get_cols( 1389 line, (-1,), ncols=10, expected={4:"attempted", 7:"prelim"}) 1390 self._params.hsps_prelim_gapped_attempted = _safe_int( 1391 self._params.hsps_prelim_gapped_attempted)
1392
1393 - def hsps_gapped(self, line):
1394 self._params.hsps_gapped, = _get_cols( 1395 line, (-1,), ncols=6, expected={3:"gapped"}) 1396 self._params.hsps_gapped = _safe_int(self._params.hsps_gapped)
1397
1398 - def query_length(self, line):
1399 self._params.query_length, = _get_cols( 1400 line.lower(), (-1,), ncols=4, expected={0:"length", 2:"query:"}) 1401 self._params.query_length = _safe_int(self._params.query_length)
1402
1403 - def database_length(self, line):
1404 self._params.database_length, = _get_cols( 1405 line.lower(), (-1,), ncols=4, expected={0:"length", 2:"database:"}) 1406 self._params.database_length = _safe_int(self._params.database_length)
1407
1408 - def effective_hsp_length(self, line):
1409 self._params.effective_hsp_length, = _get_cols( 1410 line, (-1,), ncols=4, expected={1:"HSP", 2:"length:"}) 1411 self._params.effective_hsp_length = _safe_int( 1412 self._params.effective_hsp_length)
1413
1414 - def effective_query_length(self, line):
1415 self._params.effective_query_length, = _get_cols( 1416 line, (-1,), ncols=5, expected={1:"length", 3:"query:"}) 1417 self._params.effective_query_length = _safe_int( 1418 self._params.effective_query_length)
1419
1420 - def effective_database_length(self, line):
1421 self._params.effective_database_length, = _get_cols( 1422 line.lower(), (-1,), ncols=5, expected={1:"length", 3:"database:"}) 1423 self._params.effective_database_length = _safe_int( 1424 self._params.effective_database_length)
1425
1426 - def effective_search_space(self, line):
1427 self._params.effective_search_space, = _get_cols( 1428 line, (-1,), ncols=4, expected={1:"search"}) 1429 self._params.effective_search_space = _safe_int( 1430 self._params.effective_search_space)
1431
1432 - def effective_search_space_used(self, line):
1433 self._params.effective_search_space_used, = _get_cols( 1434 line, (-1,), ncols=5, expected={1:"search", 3:"used:"}) 1435 self._params.effective_search_space_used = _safe_int( 1436 self._params.effective_search_space_used)
1437
1438 - def frameshift(self, line):
1439 self._params.frameshift = _get_cols( 1440 line, (4, 5), ncols=6, expected={0:"frameshift", 2:"decay"})
1441
1442 - def threshold(self, line):
1443 if line[:2] == "T:": 1444 #Assume its an old stlye line like "T: 123" 1445 self._params.threshold, = _get_cols( 1446 line, (1,), ncols=2, expected={0:"T:"}) 1447 elif line[:28] == "Neighboring words threshold:": 1448 self._params.threshold, = _get_cols( 1449 line, (3,), ncols=4, expected={0:"Neighboring", 1:"words", 2:"threshold:"}) 1450 else: 1451 raise ValueError("Unrecognised threshold line:\n%s" % line) 1452 self._params.threshold = _safe_int(self._params.threshold)
1453
1454 - def window_size(self, line):
1455 if line[:2] == "A:": 1456 self._params.window_size, = _get_cols( 1457 line, (1,), ncols=2, expected={0:"A:"}) 1458 elif line[:25] == "Window for multiple hits:": 1459 self._params.window_size, = _get_cols( 1460 line, (4,), ncols=5, expected={0:"Window", 2:"multiple", 3:"hits:"}) 1461 else: 1462 raise ValueError("Unrecognised window size line:\n%s" % line) 1463 self._params.window_size = _safe_int(self._params.window_size)
1464
1465 - def dropoff_1st_pass(self, line):
1466 score, bits = _re_search( 1467 r"X1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1468 "I could not find the dropoff in line\n%s" % line) 1469 self._params.dropoff_1st_pass = _safe_int(score), _safe_float(bits)
1470
1471 - def gap_x_dropoff(self, line):
1472 score, bits = _re_search( 1473 r"X2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1474 "I could not find the gap dropoff in line\n%s" % line) 1475 self._params.gap_x_dropoff = _safe_int(score), _safe_float(bits)
1476
1477 - def gap_x_dropoff_final(self, line):
1478 score, bits = _re_search( 1479 r"X3: (\d+) \(\s*([0-9,.]+) bits\)", line, 1480 "I could not find the gap dropoff final in line\n%s" % line) 1481 self._params.gap_x_dropoff_final = _safe_int(score), _safe_float(bits)
1482
1483 - def gap_trigger(self, line):
1484 score, bits = _re_search( 1485 r"S1: (\d+) \(\s*([0-9,.]+) bits\)", line, 1486 "I could not find the gap trigger in line\n%s" % line) 1487 self._params.gap_trigger = _safe_int(score), _safe_float(bits)
1488
1489 - def blast_cutoff(self, line):
1490 score, bits = _re_search( 1491 r"S2: (\d+) \(\s*([0-9,.]+) bits\)", line, 1492 "I could not find the blast cutoff in line\n%s" % line) 1493 self._params.blast_cutoff = _safe_int(score), _safe_float(bits)
1494
1495 - def end_parameters(self):
1496 pass
1497 1498
1499 -class _BlastConsumer(AbstractConsumer, 1500 _HeaderConsumer, 1501 _DescriptionConsumer, 1502 _AlignmentConsumer, 1503 _HSPConsumer, 1504 _DatabaseReportConsumer, 1505 _ParametersConsumer 1506 ):
1507 # This Consumer is inherits from many other consumer classes that handle 1508 # the actual dirty work. An alternate way to do it is to create objects 1509 # of those classes and then delegate the parsing tasks to them in a 1510 # decorator-type pattern. The disadvantage of that is that the method 1511 # names will need to be resolved in this classes. However, using 1512 # a decorator will retain more control in this class (which may or 1513 # may not be a bad thing). In addition, having each sub-consumer as 1514 # its own object prevents this object's dictionary from being cluttered 1515 # with members and reduces the chance of member collisions.
1516 - def __init__(self):
1517 self.data = None
1518
1519 - def round(self, line):
1520 # Make sure nobody's trying to pass me PSI-BLAST data! 1521 raise ValueError("This consumer doesn't handle PSI-BLAST data")
1522
1523 - def start_header(self):
1524 self.data = Record.Blast() 1525 _HeaderConsumer.start_header(self)
1526
1527 - def end_header(self):
1528 _HeaderConsumer.end_header(self) 1529 self.data.__dict__.update(self._header.__dict__)
1530
1531 - def end_descriptions(self):
1532 self.data.descriptions = self._descriptions
1533
1534 - def end_alignment(self):
1535 _AlignmentConsumer.end_alignment(self) 1536 if self._alignment.hsps: 1537 self.data.alignments.append(self._alignment) 1538 if self._multiple_alignment.alignment: 1539 self.data.multiple_alignment = self._multiple_alignment
1540
1541 - def end_hsp(self):
1542 _HSPConsumer.end_hsp(self) 1543 try: 1544 self._alignment.hsps.append(self._hsp) 1545 except AttributeError: 1546 raise ValueError("Found an HSP before an alignment")
1547
1548 - def end_database_report(self):
1549 _DatabaseReportConsumer.end_database_report(self) 1550 self.data.__dict__.update(self._dr.__dict__)
1551
1552 - def end_parameters(self):
1553 _ParametersConsumer.end_parameters(self) 1554 self.data.__dict__.update(self._params.__dict__)
1555 1556
1557 -class _PSIBlastConsumer(AbstractConsumer, 1558 _HeaderConsumer, 1559 _DescriptionConsumer, 1560 _AlignmentConsumer, 1561 _HSPConsumer, 1562 _DatabaseReportConsumer, 1563 _ParametersConsumer 1564 ):
1565 - def __init__(self):
1566 self.data = None
1567
1568 - def start_header(self):
1569 self.data = Record.PSIBlast() 1570 _HeaderConsumer.start_header(self)
1571
1572 - def end_header(self):
1573 _HeaderConsumer.end_header(self) 1574 self.data.__dict__.update(self._header.__dict__)
1575
1576 - def start_descriptions(self):
1577 self._round = Record.Round() 1578 self.data.rounds.append(self._round) 1579 _DescriptionConsumer.start_descriptions(self)
1580
1581 - def end_descriptions(self):
1582 _DescriptionConsumer.end_descriptions(self) 1583 self._round.number = self._roundnum 1584 if self._descriptions: 1585 self._round.new_seqs.extend(self._descriptions) 1586 self._round.reused_seqs.extend(self._model_sequences) 1587 self._round.new_seqs.extend(self._nonmodel_sequences) 1588 if self._converged: 1589 self.data.converged = 1
1590
1591 - def end_alignment(self):
1592 _AlignmentConsumer.end_alignment(self) 1593 if self._alignment.hsps: 1594 self._round.alignments.append(self._alignment) 1595 if self._multiple_alignment: 1596 self._round.multiple_alignment = self._multiple_alignment
1597
1598 - def end_hsp(self):
1599 _HSPConsumer.end_hsp(self) 1600 try: 1601 self._alignment.hsps.append(self._hsp) 1602 except AttributeError: 1603 raise ValueError("Found an HSP before an alignment")
1604
1605 - def end_database_report(self):
1606 _DatabaseReportConsumer.end_database_report(self) 1607 self.data.__dict__.update(self._dr.__dict__)
1608
1609 - def end_parameters(self):
1610 _ParametersConsumer.end_parameters(self) 1611 self.data.__dict__.update(self._params.__dict__)
1612 1613
1614 -class Iterator(object):
1615 """Iterates over a file of multiple BLAST results. 1616 1617 Methods: 1618 next Return the next record from the stream, or None. 1619 1620 """
1621 - def __init__(self, handle, parser=None):
1622 """__init__(self, handle, parser=None) 1623 1624 Create a new iterator. handle is a file-like object. parser 1625 is an optional Parser object to change the results into another form. 1626 If set to None, then the raw contents of the file will be returned. 1627 1628 """ 1629 try: 1630 handle.readline 1631 except AttributeError: 1632 raise ValueError( 1633 "I expected a file handle or file-like object, got %s" 1634 % type(handle)) 1635 self._uhandle = File.UndoHandle(handle) 1636 self._parser = parser 1637 self._header = []
1638
1639 - def __next__(self):
1640 """next(self) -> object 1641 1642 Return the next Blast record from the file. If no more records, 1643 return None. 1644 1645 """ 1646 lines = [] 1647 query = False 1648 while True: 1649 line = self._uhandle.readline() 1650 if not line: 1651 break 1652 # If I've reached the next one, then put the line back and stop. 1653 if lines and (line.startswith('BLAST') 1654 or line.startswith('BLAST', 1) 1655 or line.startswith('<?xml ')): 1656 self._uhandle.saveline(line) 1657 break 1658 # New style files omit the BLAST line to mark a new query: 1659 if line.startswith("Query="): 1660 if not query: 1661 if not self._header: 1662 self._header = lines[:] 1663 query = True 1664 else: 1665 #Start of another record 1666 self._uhandle.saveline(line) 1667 break 1668 lines.append(line) 1669 1670 if query and "BLAST" not in lines[0]: 1671 #Cheat and re-insert the header 1672 #print "-"*50 1673 #print "".join(self._header) 1674 #print "-"*50 1675 #print "".join(lines) 1676 #print "-"*50 1677 lines = self._header + lines 1678 1679 if not lines: 1680 return None 1681 1682 data = ''.join(lines) 1683 if self._parser is not None: 1684 return self._parser.parse(StringIO(data)) 1685 return data
1686 1687 if sys.version_info[0] < 3:
1688 - def next(self):
1689 """Deprecated Python 2 style alias for Python 3 style __next__ method.""" 1690 import warnings 1691 from Bio import BiopythonDeprecationWarning 1692 warnings.warn("Please use next(my_iterator) instead of my_iterator.next(), " 1693 "the .next() method is deprecated and will be removed in a " 1694 "future release of Biopython.", BiopythonDeprecationWarning) 1695 return self.__next__()
1696
1697 - def __iter__(self):
1698 return iter(self.__next__, None)
1699 1700
1701 -def blastall(blastcmd, program, database, infile, align_view='7', **keywds):
1702 """Execute and retrieve data from standalone BLASTPALL as handles (DEPRECATED). 1703 1704 NOTE - This function is deprecated, you are encouraged to the command 1705 line wrapper Bio.Blast.Applications.BlastallCommandline instead, or 1706 better the BLAST+ command line wrappers in Bio.Blast.Applications. 1707 1708 Execute and retrieve data from blastall. blastcmd is the command 1709 used to launch the 'blastall' executable. program is the blast program 1710 to use, e.g. 'blastp', 'blastn', etc. database is the path to the database 1711 to search against. infile is the path to the file containing 1712 the sequence to search with. 1713 1714 The return values are two handles, for standard output and standard error. 1715 1716 You may pass more parameters to **keywds to change the behavior of 1717 the search. Otherwise, optional values will be chosen by blastall. 1718 The Blast output is by default in XML format. Use the align_view keyword 1719 for output in a different format. 1720 1721 Scoring 1722 matrix Matrix to use. 1723 gap_open Gap open penalty. 1724 gap_extend Gap extension penalty. 1725 nuc_match Nucleotide match reward. (BLASTN) 1726 nuc_mismatch Nucleotide mismatch penalty. (BLASTN) 1727 query_genetic_code Genetic code for Query. 1728 db_genetic_code Genetic code for database. (TBLAST[NX]) 1729 1730 Algorithm 1731 gapped Whether to do a gapped alignment. T/F (not for TBLASTX) 1732 expectation Expectation value cutoff. 1733 wordsize Word size. 1734 strands Query strands to search against database.([T]BLAST[NX]) 1735 keep_hits Number of best hits from a region to keep. 1736 xdrop Dropoff value (bits) for gapped alignments. 1737 hit_extend Threshold for extending hits. 1738 region_length Length of region used to judge hits. 1739 db_length Effective database length. 1740 search_length Effective length of search space. 1741 1742 Processing 1743 filter Filter query sequence for low complexity (with SEG)? T/F 1744 believe_query Believe the query defline. T/F 1745 restrict_gi Restrict search to these GI's. 1746 nprocessors Number of processors to use. 1747 oldengine Force use of old engine T/F 1748 1749 Formatting 1750 html Produce HTML output? T/F 1751 descriptions Number of one-line descriptions. 1752 alignments Number of alignments. 1753 align_view Alignment view. Integer 0-11, 1754 passed as a string or integer. 1755 show_gi Show GI's in deflines? T/F 1756 seqalign_file seqalign file to output. 1757 outfile Output file for report. Filename to write to, if 1758 omitted standard output is used (which you can access 1759 from the returned handles). 1760 """ 1761 1762 _security_check_parameters(keywds) 1763 1764 att2param = { 1765 'matrix': '-M', 1766 'gap_open': '-G', 1767 'gap_extend': '-E', 1768 'nuc_match': '-r', 1769 'nuc_mismatch': '-q', 1770 'query_genetic_code': '-Q', 1771 'db_genetic_code': '-D', 1772 1773 'gapped': '-g', 1774 'expectation': '-e', 1775 'wordsize': '-W', 1776 'strands': '-S', 1777 'keep_hits': '-K', 1778 'xdrop': '-X', 1779 'hit_extend': '-f', 1780 'region_length': '-L', 1781 'db_length': '-z', 1782 'search_length': '-Y', 1783 1784 'program': '-p', 1785 'database': '-d', 1786 'infile': '-i', 1787 'filter': '-F', 1788 'believe_query': '-J', 1789 'restrict_gi': '-l', 1790 'nprocessors': '-a', 1791 'oldengine': '-V', 1792 1793 'html': '-T', 1794 'descriptions': '-v', 1795 'alignments': '-b', 1796 'align_view': '-m', 1797 'show_gi': '-I', 1798 'seqalign_file': '-O', 1799 'outfile': '-o', 1800 } 1801 from .Applications import BlastallCommandline 1802 cline = BlastallCommandline(blastcmd) 1803 cline.set_parameter(att2param['program'], program) 1804 cline.set_parameter(att2param['database'], database) 1805 cline.set_parameter(att2param['infile'], infile) 1806 cline.set_parameter(att2param['align_view'], str(align_view)) 1807 for key, value in keywds.items(): 1808 cline.set_parameter(att2param[key], str(value)) 1809 return _invoke_blast(cline)
1810 1811
1812 -def blastpgp(blastcmd, database, infile, align_view='7', **keywds):
1813 """Execute and retrieve data from standalone BLASTPGP as handles (DEPRECATED). 1814 1815 NOTE - This function is deprecated, you are encouraged to the command 1816 line wrapper Bio.Blast.Applications.BlastpgpCommandline instead, or 1817 better the BLAST+ tool psiblast via the NcbipsiblastCommandline wrapper. 1818 1819 Execute and retrieve data from blastpgp. blastcmd is the command 1820 used to launch the 'blastpgp' executable. database is the path to the 1821 database to search against. infile is the path to the file containing 1822 the sequence to search with. 1823 1824 The return values are two handles, for standard output and standard error. 1825 1826 You may pass more parameters to **keywds to change the behavior of 1827 the search. Otherwise, optional values will be chosen by blastpgp. 1828 The Blast output is by default in XML format. Use the align_view keyword 1829 for output in a different format. 1830 1831 Scoring 1832 matrix Matrix to use. 1833 gap_open Gap open penalty. 1834 gap_extend Gap extension penalty. 1835 window_size Multiple hits window size. 1836 npasses Number of passes. 1837 passes Hits/passes. Integer 0-2. 1838 1839 Algorithm 1840 gapped Whether to do a gapped alignment. T/F 1841 expectation Expectation value cutoff. 1842 wordsize Word size. 1843 keep_hits Number of beset hits from a region to keep. 1844 xdrop Dropoff value (bits) for gapped alignments. 1845 hit_extend Threshold for extending hits. 1846 region_length Length of region used to judge hits. 1847 db_length Effective database length. 1848 search_length Effective length of search space. 1849 nbits_gapping Number of bits to trigger gapping. 1850 pseudocounts Pseudocounts constants for multiple passes. 1851 xdrop_final X dropoff for final gapped alignment. 1852 xdrop_extension Dropoff for blast extensions. 1853 model_threshold E-value threshold to include in multipass model. 1854 required_start Start of required region in query. 1855 required_end End of required region in query. 1856 1857 Processing 1858 XXX should document default values 1859 program The blast program to use. (PHI-BLAST) 1860 filter Filter query sequence for low complexity (with SEG)? T/F 1861 believe_query Believe the query defline? T/F 1862 nprocessors Number of processors to use. 1863 1864 Formatting 1865 html Produce HTML output? T/F 1866 descriptions Number of one-line descriptions. 1867 alignments Number of alignments. 1868 align_view Alignment view. Integer 0-11, 1869 passed as a string or integer. 1870 show_gi Show GI's in deflines? T/F 1871 seqalign_file seqalign file to output. 1872 align_outfile Output file for alignment. 1873 checkpoint_outfile Output file for PSI-BLAST checkpointing. 1874 restart_infile Input file for PSI-BLAST restart. 1875 hit_infile Hit file for PHI-BLAST. 1876 matrix_outfile Output file for PSI-BLAST matrix in ASCII. 1877 align_outfile Output file for alignment. Filename to write to, if 1878 omitted standard output is used (which you can access 1879 from the returned handles). 1880 1881 align_infile Input alignment file for PSI-BLAST restart. 1882 """ 1883 _security_check_parameters(keywds) 1884 1885 att2param = { 1886 'matrix': '-M', 1887 'gap_open': '-G', 1888 'gap_extend': '-E', 1889 'window_size': '-A', 1890 'npasses': '-j', 1891 'passes': '-P', 1892 1893 'gapped': '-g', 1894 'expectation': '-e', 1895 'wordsize': '-W', 1896 'keep_hits': '-K', 1897 'xdrop': '-X', 1898 'hit_extend': '-f', 1899 'region_length': '-L', 1900 'db_length': '-Z', 1901 'search_length': '-Y', 1902 'nbits_gapping': '-N', 1903 'pseudocounts': '-c', 1904 'xdrop_final': '-Z', 1905 'xdrop_extension': '-y', 1906 'model_threshold': '-h', 1907 'required_start': '-S', 1908 'required_end': '-H', 1909 1910 'program': '-p', 1911 'database': '-d', 1912 'infile': '-i', 1913 'filter': '-F', 1914 'believe_query': '-J', 1915 'nprocessors': '-a', 1916 1917 'html': '-T', 1918 'descriptions': '-v', 1919 'alignments': '-b', 1920 'align_view': '-m', 1921 'show_gi': '-I', 1922 'seqalign_file': '-O', 1923 'align_outfile': '-o', 1924 'checkpoint_outfile': '-C', 1925 'restart_infile': '-R', 1926 'hit_infile': '-k', 1927 'matrix_outfile': '-Q', 1928 'align_infile': '-B', 1929 } 1930 from .Applications import BlastpgpCommandline 1931 cline = BlastpgpCommandline(blastcmd) 1932 cline.set_parameter(att2param['database'], database) 1933 cline.set_parameter(att2param['infile'], infile) 1934 cline.set_parameter(att2param['align_view'], str(align_view)) 1935 for key, value in keywds.items(): 1936 cline.set_parameter(att2param[key], str(value)) 1937 return _invoke_blast(cline)
1938 1939
1940 -def rpsblast(blastcmd, database, infile, align_view="7", **keywds):
1941 """Execute and retrieve data from standalone RPS-BLAST as handles (DEPRECATED). 1942 1943 NOTE - This function is deprecated, you are encouraged to the command 1944 line wrapper Bio.Blast.Applications.RpsBlastCommandline instead, or 1945 better the BLAST+ rpsblast wrapper NcbirpsblastCommandline. 1946 1947 Execute and retrieve data from standalone RPS-BLAST. blastcmd is the 1948 command used to launch the 'rpsblast' executable. database is the path 1949 to the database to search against. infile is the path to the file 1950 containing the sequence to search with. 1951 1952 The return values are two handles, for standard output and standard error. 1953 1954 You may pass more parameters to **keywds to change the behavior of 1955 the search. Otherwise, optional values will be chosen by rpsblast. 1956 1957 Please note that this function will give XML output by default, by 1958 setting align_view to seven (i.e. command line option -m 7). 1959 You should use the NCBIXML.parse() function to read the resulting output. 1960 This is because NCBIStandalone.BlastParser() does not understand the 1961 plain text output format from rpsblast. 1962 1963 WARNING - The following text and associated parameter handling has not 1964 received extensive testing. Please report any errors we might have made... 1965 1966 Algorithm/Scoring 1967 gapped Whether to do a gapped alignment. T/F 1968 multihit 0 for multiple hit (default), 1 for single hit 1969 expectation Expectation value cutoff. 1970 range_restriction Range restriction on query sequence (Format: start,stop) blastp only 1971 0 in 'start' refers to the beginning of the sequence 1972 0 in 'stop' refers to the end of the sequence 1973 Default = 0,0 1974 xdrop Dropoff value (bits) for gapped alignments. 1975 xdrop_final X dropoff for final gapped alignment (in bits). 1976 xdrop_extension Dropoff for blast extensions (in bits). 1977 search_length Effective length of search space. 1978 nbits_gapping Number of bits to trigger gapping. 1979 protein Query sequence is protein. T/F 1980 db_length Effective database length. 1981 1982 Processing 1983 filter Filter query sequence for low complexity? T/F 1984 case_filter Use lower case filtering of FASTA sequence T/F, default F 1985 believe_query Believe the query defline. T/F 1986 nprocessors Number of processors to use. 1987 logfile Name of log file to use, default rpsblast.log 1988 1989 Formatting 1990 html Produce HTML output? T/F 1991 descriptions Number of one-line descriptions. 1992 alignments Number of alignments. 1993 align_view Alignment view. Integer 0-11, 1994 passed as a string or integer. 1995 show_gi Show GI's in deflines? T/F 1996 seqalign_file seqalign file to output. 1997 align_outfile Output file for alignment. Filename to write to, if 1998 omitted standard output is used (which you can access 1999 from the returned handles). 2000 """ 2001 _security_check_parameters(keywds) 2002 2003 att2param = { 2004 'multihit': '-P', 2005 'gapped': '-g', 2006 'expectation': '-e', 2007 'range_restriction': '-L', 2008 'xdrop': '-X', 2009 'xdrop_final': '-Z', 2010 'xdrop_extension': '-y', 2011 'search_length': '-Y', 2012 'nbits_gapping': '-N', 2013 'protein': '-p', 2014 'db_length': '-z', 2015 2016 'database': '-d', 2017 'infile': '-i', 2018 'filter': '-F', 2019 'case_filter': '-U', 2020 'believe_query': '-J', 2021 'nprocessors': '-a', 2022 'logfile': '-l', 2023 2024 'html': '-T', 2025 'descriptions': '-v', 2026 'alignments': '-b', 2027 'align_view': '-m', 2028 'show_gi': '-I', 2029 'seqalign_file': '-O', 2030 'align_outfile': '-o', 2031 } 2032 2033 from .Applications import RpsBlastCommandline 2034 cline = RpsBlastCommandline(blastcmd) 2035 cline.set_parameter(att2param['database'], database) 2036 cline.set_parameter(att2param['infile'], infile) 2037 cline.set_parameter(att2param['align_view'], str(align_view)) 2038 for key, value in keywds.items(): 2039 cline.set_parameter(att2param[key], str(value)) 2040 return _invoke_blast(cline)
2041 2042
2043 -def _re_search(regex, line, error_msg):
2044 m = re.search(regex, line) 2045 if not m: 2046 raise ValueError(error_msg) 2047 return m.groups()
2048 2049
2050 -def _get_cols(line, cols_to_get, ncols=None, expected={}):
2051 cols = line.split() 2052 2053 # Check to make sure number of columns is correct 2054 if ncols is not None and len(cols) != ncols: 2055 raise ValueError("I expected %d columns (got %d) in line\n%s" 2056 % (ncols, len(cols), line)) 2057 2058 # Check to make sure columns contain the correct data 2059 for k in expected: 2060 if cols[k] != expected[k]: 2061 raise ValueError("I expected '%s' in column %d in line\n%s" 2062 % (expected[k], k, line)) 2063 2064 # Construct the answer tuple 2065 results = [] 2066 for c in cols_to_get: 2067 results.append(cols[c]) 2068 return tuple(results)
2069 2070
2071 -def _safe_int(str):
2072 try: 2073 return int(str) 2074 except ValueError: 2075 # Something went wrong. Try to clean up the string. 2076 # Remove all commas from the string 2077 str = str.replace(',', '') 2078 # try again after removing commas. 2079 # Note int() will return a long rather than overflow 2080 try: 2081 return int(str) 2082 except ValueError: 2083 pass 2084 # Call float to handle things like "54.3", note could lose precision, e.g. 2085 # >>> int("5399354557888517312") 2086 # 5399354557888517312 2087 # >>> int(float("5399354557888517312")) 2088 # 5399354557888517120 2089 return int(float(str))
2090 2091
2092 -def _safe_float(str):
2093 # Thomas Rosleff Soerensen (rosleff@mpiz-koeln.mpg.de) noted that 2094 # float('e-172') does not produce an error on his platform. Thus, 2095 # we need to check the string for this condition. 2096 2097 # Sometimes BLAST leaves of the '1' in front of an exponent. 2098 if str and str[0] in ['E', 'e']: 2099 str = '1' + str 2100 try: 2101 return float(str) 2102 except ValueError: 2103 # Remove all commas from the string 2104 str = str.replace(',', '') 2105 # try again. 2106 return float(str)
2107 2108
2109 -def _invoke_blast(cline):
2110 """Start BLAST and returns handles for stdout and stderr (PRIVATE). 2111 2112 Expects a command line wrapper object from Bio.Blast.Applications 2113 """ 2114 import subprocess 2115 import sys 2116 blast_cmd = cline.program_name 2117 if not os.path.exists(blast_cmd): 2118 raise ValueError("BLAST executable does not exist at %s" % blast_cmd) 2119 #We don't need to supply any piped input, but we setup the 2120 #standard input pipe anyway as a work around for a python 2121 #bug if this is called from a Windows GUI program. For 2122 #details, see http://bugs.python.org/issue1124861 2123 blast_process = subprocess.Popen(str(cline), 2124 stdin=subprocess.PIPE, 2125 stdout=subprocess.PIPE, 2126 stderr=subprocess.PIPE, 2127 universal_newlines=True, 2128 shell=(sys.platform!="win32")) 2129 blast_process.stdin.close() 2130 return blast_process.stdout, blast_process.stderr
2131 2132
2133 -def _security_check_parameters(param_dict):
2134 """Look for any attempt to insert a command into a parameter. 2135 2136 e.g. blastall(..., matrix='IDENTITY -F 0; rm -rf /etc/passwd') 2137 2138 Looks for ";" or "&&" in the strings (Unix and Windows syntax 2139 for appending a command line), or ">", "<" or "|" (redirection) 2140 and if any are found raises an exception. 2141 """ 2142 for key, value in param_dict.items(): 2143 str_value = str(value) # Could easily be an int or a float 2144 for bad_str in [";", "&&", ">", "<", "|"]: 2145 if bad_str in str_value: 2146 raise ValueError("Rejecting suspicious argument for %s" % key)
2147 2148
2149 -class _BlastErrorConsumer(_BlastConsumer):
2150 - def __init__(self):
2152
2153 - def noevent(self, line):
2154 if 'Query must be at least wordsize' in line: 2155 raise ShortQueryBlastError("Query must be at least wordsize") 2156 # Now pass the line back up to the superclass. 2157 method = getattr(_BlastConsumer, 'noevent', 2158 _BlastConsumer.__getattr__(self, 'noevent')) 2159 method(line)
2160 2161
2162 -class BlastErrorParser(AbstractParser):
2163 """Attempt to catch and diagnose BLAST errors while parsing. 2164 2165 This utilizes the BlastParser module but adds an additional layer 2166 of complexity on top of it by attempting to diagnose ValueErrors 2167 that may actually indicate problems during BLAST parsing. 2168 2169 Current BLAST problems this detects are: 2170 o LowQualityBlastError - When BLASTing really low quality sequences 2171 (ie. some GenBank entries which are just short streches of a single 2172 nucleotide), BLAST will report an error with the sequence and be 2173 unable to search with this. This will lead to a badly formatted 2174 BLAST report that the parsers choke on. The parser will convert the 2175 ValueError to a LowQualityBlastError and attempt to provide useful 2176 information. 2177 """
2178 - def __init__(self, bad_report_handle = None):
2179 """Initialize a parser that tries to catch BlastErrors. 2180 2181