Package Bio :: Package Blast :: Module NCBIWWW
[hide private]
[frames] | no frames]

Source Code for Module Bio.Blast.NCBIWWW

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  # Patched by Brad Chapman. 
  7  # Chris Wroe added modifications for work in myGrid 
  8   
  9  """Code to invoke the NCBI BLAST server over the internet. 
 10   
 11  This module provides code to work with the WWW version of BLAST 
 12  provided by the NCBI. https://blast.ncbi.nlm.nih.gov/ 
 13  """ 
 14   
 15  from __future__ import print_function 
 16   
 17  from Bio._py3k import StringIO 
 18  from Bio._py3k import _as_string, _as_bytes 
 19  from Bio._py3k import urlopen as _urlopen 
 20  from Bio._py3k import urlencode as _urlencode 
 21  from Bio._py3k import Request as _Request 
 22   
 23   
 24  NCBI_BLAST_URL = "https://blast.ncbi.nlm.nih.gov/Blast.cgi" 
 25   
 26   
27 -def qblast(program, database, sequence, url_base=NCBI_BLAST_URL, 28 auto_format=None, composition_based_statistics=None, 29 db_genetic_code=None, endpoints=None, entrez_query='(none)', 30 expect=10.0, filter=None, gapcosts=None, genetic_code=None, 31 hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None, 32 matrix_name=None, nucl_penalty=None, nucl_reward=None, 33 other_advanced=None, perc_ident=None, phi_pattern=None, 34 query_file=None, query_believe_defline=None, query_from=None, 35 query_to=None, searchsp_eff=None, service=None, threshold=None, 36 ungapped_alignment=None, word_size=None, 37 alignments=500, alignment_view=None, descriptions=500, 38 entrez_links_new_window=None, expect_low=None, expect_high=None, 39 format_entrez_query=None, format_object=None, format_type='XML', 40 ncbi_gi=None, results_file=None, show_overview=None, megablast=None, 41 ):
42 """Do a BLAST search using the QBLAST server at NCBI or a cloud service 43 provider. 44 45 Supports all parameters of the qblast API for Put and Get. 46 47 Please note that BLAST on the cloud supports the NCBI-BLAST Common 48 URL API (http://ncbi.github.io/blast-cloud/dev/api.html). To 49 use this feature, please set url_base to 50 'http://host.my.cloud.service.provider.com/cgi-bin/blast.cgi' and 51 format_object='Alignment'. For more details, please see 52 https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=CloudBlast 53 54 Some useful parameters: 55 56 - program blastn, blastp, blastx, tblastn, or tblastx (lower case) 57 - database Which database to search against (e.g. "nr"). 58 - sequence The sequence to search. 59 - ncbi_gi TRUE/FALSE whether to give 'gi' identifier. 60 - descriptions Number of descriptions to show. Def 500. 61 - alignments Number of alignments to show. Def 500. 62 - expect An expect value cutoff. Def 10.0. 63 - matrix_name Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45). 64 - filter "none" turns off filtering. Default no filtering 65 - format_type "HTML", "Text", "ASN.1", or "XML". Def. "XML". 66 - entrez_query Entrez query to limit Blast search 67 - hitlist_size Number of hits to return. Default 50 68 - megablast TRUE/FALSE whether to use MEga BLAST algorithm (blastn only) 69 - service plain, psi, phi, rpsblast, megablast (lower case) 70 71 This function does no checking of the validity of the parameters 72 and passes the values to the server as is. More help is available at: 73 http://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html 74 75 """ 76 import time 77 78 assert program in ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx'] 79 80 # Format the "Put" command, which sends search requests to qblast. 81 # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node5.html on 9 July 2007 82 # Additional parameters are taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node9.html on 8 Oct 2010 83 # To perform a PSI-BLAST or PHI-BLAST search the service ("Put" and "Get" commands) must be specified 84 # (e.g. psi_blast = NCBIWWW.qblast("blastp", "refseq_protein", input_sequence, service="psi")) 85 parameters = [ 86 ('AUTO_FORMAT', auto_format), 87 ('COMPOSITION_BASED_STATISTICS', composition_based_statistics), 88 ('DATABASE', database), 89 ('DB_GENETIC_CODE', db_genetic_code), 90 ('ENDPOINTS', endpoints), 91 ('ENTREZ_QUERY', entrez_query), 92 ('EXPECT', expect), 93 ('FILTER', filter), 94 ('GAPCOSTS', gapcosts), 95 ('GENETIC_CODE', genetic_code), 96 ('HITLIST_SIZE', hitlist_size), 97 ('I_THRESH', i_thresh), 98 ('LAYOUT', layout), 99 ('LCASE_MASK', lcase_mask), 100 ('MEGABLAST', megablast), 101 ('MATRIX_NAME', matrix_name), 102 ('NUCL_PENALTY', nucl_penalty), 103 ('NUCL_REWARD', nucl_reward), 104 ('OTHER_ADVANCED', other_advanced), 105 ('PERC_IDENT', perc_ident), 106 ('PHI_PATTERN', phi_pattern), 107 ('PROGRAM', program), 108 # ('PSSM',pssm), - It is possible to use PSI-BLAST via this API? 109 ('QUERY', sequence), 110 ('QUERY_FILE', query_file), 111 ('QUERY_BELIEVE_DEFLINE', query_believe_defline), 112 ('QUERY_FROM', query_from), 113 ('QUERY_TO', query_to), 114 # ('RESULTS_FILE',...), - Can we use this parameter? 115 ('SEARCHSP_EFF', searchsp_eff), 116 ('SERVICE', service), 117 ('THRESHOLD', threshold), 118 ('UNGAPPED_ALIGNMENT', ungapped_alignment), 119 ('WORD_SIZE', word_size), 120 ('CMD', 'Put'), 121 ] 122 query = [x for x in parameters if x[1] is not None] 123 message = _as_bytes(_urlencode(query)) 124 125 # Send off the initial query to qblast. 126 # Note the NCBI do not currently impose a rate limit here, other 127 # than the request not to make say 50 queries at once using multiple 128 # threads. 129 request = _Request(url_base, 130 message, 131 {"User-Agent": "BiopythonClient"}) 132 handle = _urlopen(request) 133 134 # Format the "Get" command, which gets the formatted results from qblast 135 # Parameters taken from http://www.ncbi.nlm.nih.gov/BLAST/Doc/node6.html on 9 July 2007 136 rid, rtoe = _parse_qblast_ref_page(handle) 137 parameters = [ 138 ('ALIGNMENTS', alignments), 139 ('ALIGNMENT_VIEW', alignment_view), 140 ('DESCRIPTIONS', descriptions), 141 ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window), 142 ('EXPECT_LOW', expect_low), 143 ('EXPECT_HIGH', expect_high), 144 ('FORMAT_ENTREZ_QUERY', format_entrez_query), 145 ('FORMAT_OBJECT', format_object), 146 ('FORMAT_TYPE', format_type), 147 ('NCBI_GI', ncbi_gi), 148 ('RID', rid), 149 ('RESULTS_FILE', results_file), 150 ('SERVICE', service), 151 ('SHOW_OVERVIEW', show_overview), 152 ('CMD', 'Get'), 153 ] 154 query = [x for x in parameters if x[1] is not None] 155 message = _as_bytes(_urlencode(query)) 156 157 # Poll NCBI until the results are ready. Use a backoff delay from 2 - 120 second wait 158 delay = 2.0 159 previous = time.time() 160 while True: 161 current = time.time() 162 wait = previous + delay - current 163 if wait > 0: 164 time.sleep(wait) 165 previous = current + wait 166 else: 167 previous = current 168 if delay + .5 * delay <= 120: 169 delay += .5 * delay 170 else: 171 delay = 120 172 173 request = _Request(url_base, 174 message, 175 {"User-Agent": "BiopythonClient"}) 176 handle = _urlopen(request) 177 results = _as_string(handle.read()) 178 179 # Can see an "\n\n" page while results are in progress, 180 # if so just wait a bit longer... 181 if results == "\n\n": 182 continue 183 # XML results don't have the Status tag when finished 184 if "Status=" not in results: 185 break 186 i = results.index("Status=") 187 j = results.index("\n", i) 188 status = results[i + len("Status="):j].strip() 189 if status.upper() == "READY": 190 break 191 192 return StringIO(results)
193 194
195 -def _parse_qblast_ref_page(handle):
196 """Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE). 197 198 The NCBI FAQ pages use TOE for 'Time of Execution', so RTOE is probably 199 'Request Time of Execution' and RID would be 'Request Identifier'. 200 """ 201 s = _as_string(handle.read()) 202 i = s.find("RID =") 203 if i == -1: 204 rid = None 205 else: 206 j = s.find("\n", i) 207 rid = s[i + len("RID ="):j].strip() 208 209 i = s.find("RTOE =") 210 if i == -1: 211 rtoe = None 212 else: 213 j = s.find("\n", i) 214 rtoe = s[i + len("RTOE ="):j].strip() 215 216 if not rid and not rtoe: 217 # Can we reliably extract the error message from the HTML page? 218 # e.g. "Message ID#24 Error: Failed to read the Blast query: 219 # Nucleotide FASTA provided for protein sequence" 220 # or "Message ID#32 Error: Query contains no data: Query 221 # contains no sequence data" 222 # 223 # This used to occur inside a <div class="error msInf"> entry: 224 i = s.find('<div class="error msInf">') 225 if i != -1: 226 msg = s[i + len('<div class="error msInf">'):].strip() 227 msg = msg.split("</div>", 1)[0].split("\n", 1)[0].strip() 228 if msg: 229 raise ValueError("Error message from NCBI: %s" % msg) 230 # In spring 2010 the markup was like this: 231 i = s.find('<p class="error">') 232 if i != -1: 233 msg = s[i + len('<p class="error">'):].strip() 234 msg = msg.split("</p>", 1)[0].split("\n", 1)[0].strip() 235 if msg: 236 raise ValueError("Error message from NCBI: %s" % msg) 237 # Generic search based on the way the error messages start: 238 i = s.find('Message ID#') 239 if i != -1: 240 # Break the message at the first HTML tag 241 msg = s[i:].split("<", 1)[0].split("\n", 1)[0].strip() 242 raise ValueError("Error message from NCBI: %s" % msg) 243 # We didn't recognise the error layout :( 244 # print s 245 raise ValueError("No RID and no RTOE found in the 'please wait' page, " 246 "there was probably an error in your request but we " 247 "could not extract a helpful error message.") 248 elif not rid: 249 # Can this happen? 250 raise ValueError("No RID found in the 'please wait' page." 251 " (although RTOE = %s)" % repr(rtoe)) 252 elif not rtoe: 253 # Can this happen? 254 raise ValueError("No RTOE found in the 'please wait' page." 255 " (although RID = %s)" % repr(rid)) 256 257 try: 258 return rid, int(rtoe) 259 except ValueError: 260 raise ValueError("A non-integer RTOE found in " 261 "the 'please wait' page, %s" % repr(rtoe))
262