1
2
3
4
5
6
7
8
9 """
10 This module provides code to work with the WWW version of BLAST
11 provided by the NCBI.
12 http://blast.ncbi.nlm.nih.gov/
13
14 Functions:
15 qblast Do a BLAST search using the QBLAST API.
16 """
17
18 try:
19 from cStringIO import StringIO
20 except ImportError:
21 from StringIO import StringIO
22
23 from Bio._py3k import _as_string, _as_bytes
24
25
def qblast(program, database, sequence,
           auto_format=None, composition_based_statistics=None,
           db_genetic_code=None, endpoints=None, entrez_query='(none)',
           expect=10.0, filter=None, gapcosts=None, genetic_code=None,
           hitlist_size=50, i_thresh=None, layout=None, lcase_mask=None,
           matrix_name=None, nucl_penalty=None, nucl_reward=None,
           other_advanced=None, perc_ident=None, phi_pattern=None,
           query_file=None, query_believe_defline=None, query_from=None,
           query_to=None, searchsp_eff=None, service=None, threshold=None,
           ungapped_alignment=None, word_size=None,
           alignments=500, alignment_view=None, descriptions=500,
           entrez_links_new_window=None, expect_low=None, expect_high=None,
           format_entrez_query=None, format_object=None, format_type='XML',
           ncbi_gi=None, results_file=None, show_overview=None, megablast=None,
           ):
    """Do a BLAST search using the QBLAST server at NCBI.

    Supports all parameters of the qblast API for Put and Get.
    Some useful parameters:
    program        blastn, blastp, blastx, tblastn, or tblastx (lower case)
    database       Which database to search against (e.g. "nr").
    sequence       The sequence to search.
    ncbi_gi        TRUE/FALSE whether to give 'gi' identifier.
    descriptions   Number of descriptions to show.  Def 500.
    alignments     Number of alignments to show.  Def 500.
    expect         An expect value cutoff.  Def 10.0.
    matrix_name    Specify an alt. matrix (PAM30, PAM70, BLOSUM80, BLOSUM45).
    filter         "none" turns off filtering.  Default no filtering
    format_type    "HTML", "Text", "ASN.1", or "XML".  Def. "XML".
    entrez_query   Entrez query to limit Blast search
    hitlist_size   Number of hits to return. Default 50
    megablast      TRUE/FALSE whether to use Mega BLAST algorithm (blastn only)
    service        plain, psi, phi, rpsblast, megablast (lower case)

    Apart from checking the program name, this function does no checking
    of the validity of the parameters and passes the values to the server
    as is.  Any parameter left as None is omitted from the request.
    More help is available at:
    http://www.ncbi.nlm.nih.gov/BLAST/blast_overview.html

    Returns a StringIO handle wrapping the text of the last page fetched
    from the server (the results once they are ready).

    Raises ValueError if program is not one of the five names listed
    above; a ValueError raised by _parse_qblast_ref_page (e.g. when the
    server rejects the request) is propagated unchanged.
    """
    import urllib
    import urllib2
    import time

    # Explicit check rather than an assert, so that the validation is not
    # silently skipped when Python is run with -O.
    programs = ('blastn', 'blastp', 'blastx', 'tblastn', 'tblastx')
    if program not in programs:
        raise ValueError("Program specified is %r. Expected one of %s"
                         % (program, ", ".join(programs)))

    url = "http://blast.ncbi.nlm.nih.gov/Blast.cgi"
    headers = {"User-Agent": "BiopythonClient"}

    # Build the "Put" command, which submits the search.  Parameters whose
    # value is None are dropped so the server applies its own defaults.
    parameters = [
        ('AUTO_FORMAT', auto_format),
        ('COMPOSITION_BASED_STATISTICS', composition_based_statistics),
        ('DATABASE', database),
        ('DB_GENETIC_CODE', db_genetic_code),
        ('ENDPOINTS', endpoints),
        ('ENTREZ_QUERY', entrez_query),
        ('EXPECT', expect),
        ('FILTER', filter),
        ('GAPCOSTS', gapcosts),
        ('GENETIC_CODE', genetic_code),
        ('HITLIST_SIZE', hitlist_size),
        ('I_THRESH', i_thresh),
        ('LAYOUT', layout),
        ('LCASE_MASK', lcase_mask),
        ('MEGABLAST', megablast),
        ('MATRIX_NAME', matrix_name),
        ('NUCL_PENALTY', nucl_penalty),
        ('NUCL_REWARD', nucl_reward),
        ('OTHER_ADVANCED', other_advanced),
        ('PERC_IDENT', perc_ident),
        ('PHI_PATTERN', phi_pattern),
        ('PROGRAM', program),
        ('QUERY', sequence),
        ('QUERY_FILE', query_file),
        ('QUERY_BELIEVE_DEFLINE', query_believe_defline),
        ('QUERY_FROM', query_from),
        ('QUERY_TO', query_to),
        ('SEARCHSP_EFF', searchsp_eff),
        ('SERVICE', service),
        ('THRESHOLD', threshold),
        ('UNGAPPED_ALIGNMENT', ungapped_alignment),
        ('WORD_SIZE', word_size),
        ('CMD', 'Put'),
        ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(urllib.urlencode(query))

    # Submit the search.  The reply is the 'please wait' page, from which
    # the request identifier (RID) is extracted.  The response handle is
    # closed explicitly so the connection is not left open.
    request = urllib2.Request(url, message, headers)
    handle = urllib2.urlopen(request)
    try:
        # rtoe is parsed (and validated) but not otherwise used here.
        rid, rtoe = _parse_qblast_ref_page(handle)
    finally:
        handle.close()

    # Build the "Get" command, which fetches the formatted results for
    # the RID obtained above.
    parameters = [
        ('ALIGNMENTS', alignments),
        ('ALIGNMENT_VIEW', alignment_view),
        ('DESCRIPTIONS', descriptions),
        ('ENTREZ_LINKS_NEW_WINDOW', entrez_links_new_window),
        ('EXPECT_LOW', expect_low),
        ('EXPECT_HIGH', expect_high),
        ('FORMAT_ENTREZ_QUERY', format_entrez_query),
        ('FORMAT_OBJECT', format_object),
        ('FORMAT_TYPE', format_type),
        ('NCBI_GI', ncbi_gi),
        ('RID', rid),
        ('RESULTS_FILE', results_file),
        ('SERVICE', service),
        ('SHOW_OVERVIEW', show_overview),
        ('CMD', 'Get'),
        ]
    query = [x for x in parameters if x[1] is not None]
    message = _as_bytes(urllib.urlencode(query))

    # Poll the server until the results are ready, leaving at least
    # `delay` seconds between the start of successive requests.
    delay = 3.0
    previous = time.time()
    while True:
        current = time.time()
        wait = previous + delay - current
        if wait > 0:
            time.sleep(wait)
            previous = current + wait
        else:
            previous = current

        request = urllib2.Request(url, message, headers)
        handle = urllib2.urlopen(request)
        try:
            results = _as_string(handle.read())
        finally:
            handle.close()

        # A page consisting of just two newlines is treated as "not ready
        # yet": go round again.
        if results == "\n\n":
            continue
        # A page without any Status tag is taken to be the final results.
        if "Status=" not in results:
            break
        i = results.index("Status=") + len("Status=")
        j = results.find("\n", i)
        if j == -1:
            # Status line is the last line and has no trailing newline.
            j = len(results)
        status = results[i:j].strip()
        if status.upper() == "READY":
            break

    return StringIO(results)
180
181
def _parse_qblast_ref_page(handle):
    """Extract a tuple of RID, RTOE from the 'please wait' page (PRIVATE).

    The NCBI FAQ pages use TOE for 'Time of Execution', so RTOE is probably
    'Request Time of Execution' and RID would be 'Request Identifier'.

    Arguments:
     - handle - object whose read() method returns the whole page; it is
       read once and not closed here.

    Returns a tuple (rid, rtoe) where rid is a string and rtoe an integer.

    Raises ValueError if the RID or the RTOE is missing or empty (using
    any error message found in the page when both are missing), or if the
    RTOE is not an integer.
    """
    s = _as_string(handle.read())

    def field(label):
        """Return the stripped text following label on its line, or None."""
        i = s.find(label)
        if i == -1:
            return None
        j = s.find("\n", i)
        if j == -1:
            # The label is on the last line and there is no trailing
            # newline; slicing up to -1 would drop the final character.
            j = len(s)
        return s[i + len(label):j].strip()

    rid = field("RID =")
    rtoe = field("RTOE =")

    if not rid and not rtoe:
        # Neither value present: look for an error message in the page,
        # trying each known HTML wrapper in turn.  Only the first line of
        # text inside the element is reported.
        for start_tag, end_tag in (('<div class="error msInf">', "</div>"),
                                   ('<p class="error">', "</p>")):
            i = s.find(start_tag)
            if i != -1:
                msg = s[i + len(start_tag):].strip()
                msg = msg.split(end_tag, 1)[0].split("\n", 1)[0].strip()
                if msg:
                    raise ValueError("Error message from NCBI: %s" % msg)

        # Otherwise report any line starting 'Message ID#', up to the next
        # tag or end of line.
        i = s.find('Message ID#')
        if i != -1:
            msg = s[i:].split("<", 1)[0].split("\n", 1)[0].strip()
            raise ValueError("Error message from NCBI: %s" % msg)

        # No recognisable error message, so give a generic one.
        raise ValueError("No RID and no RTOE found in the 'please wait' page, "
                         "there was probably an error in your request but we "
                         "could not extract a helpful error message.")
    elif not rid:
        raise ValueError("No RID found in the 'please wait' page."
                         " (although RTOE = %s)" % repr(rtoe))
    elif not rtoe:
        raise ValueError("No RTOE found in the 'please wait' page."
                         " (although RID = %s)" % repr(rid))

    try:
        return rid, int(rtoe)
    except ValueError:
        raise ValueError("A non-integer RTOE found in "
                         "the 'please wait' page, %s" % repr(rtoe))
249