Package Bio :: Package Blast :: Module NCBIWWW
[hide private]
[frames | no frames]

Source Code for Module Bio.Blast.NCBIWWW

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  # Patched by Brad Chapman. 
  7  # Chris Wroe added modifications for work in myGrid 
  8   
  9  """Code to invoke the NCBI BLAST server over the internet. 
 10   
 11  This module provides code to work with the WWW version of BLAST 
 12  provided by the NCBI. 
 13  http://blast.ncbi.nlm.nih.gov/ 
 14  """ 
 15   
 16  from __future__ import print_function 
 17   
 18  from Bio._py3k import StringIO 
 19  from Bio._py3k import _as_string, _as_bytes 
 20  from Bio._py3k import urlopen as _urlopen 
 21  from Bio._py3k import urlencode as _urlencode 
 22  from Bio._py3k import Request as _Request 
 23   
 24   
 25  NCBI_BLAST_URL = "http://blast.ncbi.nlm.nih.gov/Blast.cgi" 
 26   
 27   
def qblast(program, database, sequence, url_base=NCBI_BLAST_URL,
           auto_format=None, composition_based_statistics=None,
           db_genetic_code=None, endpoints=None, entrez_query='(none)',
           expect=10.0, filter=None, gapcosts=None, genetic_code=None,
           hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None,
           matrix_name=None, nucl_penalty=None, nucl_reward=None,
           other_advanced=None, perc_ident=None, phi_pattern=None,
           query_file=None, query_believe_defline=None, query_from=None,
           query_to=None, searchsp_eff=None, service=None, threshold=None,
           ungapped_alignment=None, word_size=None,
           alignments=500, alignment_view=None, descriptions=500,
           entrez_links_new_window=None, expect_low=None, expect_high=None,
           format_entrez_query=None, format_object=None, format_type='XML',
           ncbi_gi=None, results_file=None, show_overview=None, megablast=None,
           ):
    """Do a BLAST search using the QBLAST server at NCBI or a cloud service provider.

    Supports all parameters of the qblast API for Put and Get.

    Please note that BLAST on the cloud supports the NCBI-BLAST Common
    URL API (http://ncbi.github.io/blast-cloud/dev/api.html). To
    use this feature, please set url_base to
    'http://host.my.cloud.service.provider.com/cgi-bin/blast.cgi' and
    format_object='Alignment'. For more details, please see
    https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=CloudBlast

    Some useful parameters:

     - program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
     - database       Which database to search against (e.g. "nr").
     - sequence       The sequence to search.
     - ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
     - descriptions   Number of descriptions to show.  Def 500.
     - alignments     Number of alignments to show.  Def 500.
     - expect         An expect value cutoff.  Def 10.0.
     - matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
     - filter         "none" turns off filtering.  Default no filtering
     - format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
     - entrez_query   Entrez query to limit Blast search
     - hitlist_size   Number of hits to return. Default 50
     - megablast      TRUE/FALSE whether to use MEGABLAST algorithm (blastn only)
     - service        plain, psi, phi, rpsblast, megablast (lower case)

    Apart from checking that program is one of the five names listed
    above, this function does no checking of the validity of the
    parameters and passes the values to the server as is.  Parameters
    left as None are omitted from the request.  More help is available at:
    http://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html

    Returns a StringIO handle holding the text of the final results page.

    Raises ValueError if program is not a recognised name, or if the
    RID/RTOE values cannot be extracted from the server's reply to the
    initial "Put" request.
    """
    import time

    # Validate with an explicit exception: an assert would be silently
    # skipped when Python runs with -O.
    programs = ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']
    if program not in programs:
        raise ValueError("Program specified is %s. Expected one of %s"
                         % (program, ", ".join(programs)))

    headers = {"User-Agent": "BiopythonClient"}

    # Format the "Put" command, which sends search requests to qblast.
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007
    # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010
    # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified
    # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi"))
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MEGABLAST', megablast),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        # ('PSSM',pssm), - It is possible to use PSI-BLAST via this API?
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        # ('RESULTS_FILE',...), - Can we use this parameter?
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
    ]
    # Only send the options the caller actually set.
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Send off the initial query to qblast.
    # Note the NCBI do not currently impose a rate limit here, other
    # than the request not to make say 50 queries at once using multiple
    # threads.
    request = _Request(url_base, message, headers)
    handle = _urlopen(request)
    try:
        # Only the RID is needed to fetch the results; the RTOE value is
        # validated by the parser but is not used to schedule polling here.
        rid, _rtoe = _parse_qblast_ref_page(handle)
    finally:
        # Always release the connection, even if parsing raised.
        handle.close()

    # Format the "Get" command, which gets the formatted results from qblast
    # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
    ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(_urlencode(query))

    # Poll NCBI until the results are ready.  Use a backoff delay from 2 - 120 second wait
    delay = 2.0
    previous = time.time()
    while True:
        current = time.time()
        wait = previous + delay - current
        if wait > 0:
            time.sleep(wait)
            previous = current + wait
        else:
            previous = current
        # Grow the delay by 50% per poll, capped at two minutes.
        delay = min(delay * 1.5, 120)

        request = _Request(url_base, message, headers)
        handle = _urlopen(request)
        try:
            results = _as_string(handle.read())
        finally:
            handle.close()

        # Can see an "\n\n" page while results are in progress,
        # if so just wait a bit longer...
        if results == "\n\n":
            continue
        # XML results don't have the Status tag when finished
        if "Status=" not in results:
            break
        # Take the text after the first "Status=" up to the end of that
        # line (or the end of the page if there is no trailing newline).
        status = results.split("Status=", 1)[1].split("\n", 1)[0].strip()
        if status.upper() == "READY":
            break

    return StringIO(results)
194 195
def _parse_qblast_ref_page(handle):
    """Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE).

    The NCBI FAQ pages use TOE for 'Time of Execution', so RTOE is probably
    'Request Time of Execution' and RID would be 'Request Identifier'.

    The handle is read in full but not closed.  Returns (rid, rtoe) where
    rid is a string and rtoe has been converted to an int.

    Raises ValueError if either value is missing (including, where one can
    be found in the page, the error message reported by the server) or if
    RTOE is not an integer.
    """
    s = _as_string(handle.read())
    rid = _qblast_ref_page_value(s, "RID =")
    rtoe = _qblast_ref_page_value(s, "RTOE =")

    if not rid and not rtoe:
        # Can we reliably extract the error message from the HTML page?
        # e.g.  "Message ID#24 Error: Failed to read the Blast query:
        #        Nucleotide FASTA provided for protein sequence"
        # or    "Message ID#32 Error: Query contains no data: Query
        #        contains no sequence data"
        msg = _qblast_ref_page_error(s)
        if msg is not None:
            raise ValueError("Error message from NCBI: %s" % msg)
        # We didn't recognise the error layout :(
        raise ValueError("No RID and no RTOE found in the 'please wait' page, "
                         "there was probably an error in your request but we "
                         "could not extract a helpful error message.")
    if not rid:
        # Can this happen?
        raise ValueError("No RID found in the 'please wait' page."
                         " (although RTOE = %s)" % repr(rtoe))
    if not rtoe:
        # Can this happen?
        raise ValueError("No RTOE found in the 'please wait' page."
                         " (although RID = %s)" % repr(rid))

    try:
        return rid, int(rtoe)
    except ValueError:
        raise ValueError("A non-integer RTOE found in "
                         "the 'please wait' page, %s" % repr(rtoe))


def _qblast_ref_page_value(page, label):
    """Return the stripped text after label on its line, or None if absent (PRIVATE).

    The value runs to the first newline after the label, or to the end of
    the page when the label is on an unterminated last line (slicing with
    the -1 returned by str.find would drop the final character).
    """
    start = page.find(label)
    if start == -1:
        return None
    return page[start + len(label):].split("\n", 1)[0].strip()


def _qblast_ref_page_error(page):
    """Return the error text found in the page, or None if not recognised (PRIVATE)."""
    # Markup variants tried in order: the <div class="error msInf"> form
    # used at one time, then the <p class="error"> form seen in spring 2010.
    known_markup = (
        ('<div class="error msInf">', "</div>"),
        ('<p class="error">', "</p>"),
    )
    for open_tag, close_tag in known_markup:
        i = page.find(open_tag)
        if i != -1:
            msg = page[i + len(open_tag):].strip()
            msg = msg.split(close_tag, 1)[0].split("\n", 1)[0].strip()
            if msg:
                return msg
    # Generic search based on the way the error messages start:
    i = page.find('Message ID#')
    if i != -1:
        # Break the message at the first HTML tag
        return page[i:].split("<", 1)[0].split("\n", 1)[0].strip()
    return None
263