Package Bio :: Package TogoWS
[hide private]
[frames] | no frames]

Source Code for Package Bio.TogoWS

  1  # Copyright 2010-2011 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Provides code to access the TogoWS integrated websevices of DBCLS, Japan. 
  7   
  8  This module aims to make the TogoWS (from DBCLS, Japan) easier to use. See: 
  9  http://togows.dbcls.jp/ 
 10   
 11  The TogoWS REST service provides simple access to a range of databases, acting 
 12  as a proxy to shield you from all the different provider APIs. This works using 
 13  simple URLs (which this module will construct for you). For more details, see 
 14  http://togows.dbcls.jp/site/en/rest.html 
 15   
 16  The functionality is somewhat similar to Biopython's Bio.Entrez module which 
 17  provides access to the NCBI's Entrez Utilities (E-Utils) which also covers a 
 18  wide range of databases. 
 19   
 20  Currently TogoWS does not provide any usage guidelines (unlike the NCBI whose 
 21  requirements are reasonably clear). To avoid risking overloading the service, 
 22  Biopython will only allow three calls per second. 
 23   
 24  The TogoWS SOAP service offers a more complex API for calling web services 
 25  (essentially calling remote functions) provided by DDBJ, KEGG and PDBj. For 
 26  example, this allows you to run a remote BLAST search at the DDBJ. This is 
 27  not yet covered by this module, however there are lots of Python examples 
 28  on the TogoWS website using the SOAPpy python library. See: 
 29  http://togows.dbcls.jp/site/en/soap.html 
 30  http://soapy.sourceforge.net/ 
 31  """ 
 32   
 33  from __future__ import print_function 
 34   
 35  import time 
 36  from Bio._py3k import _binary_to_string_handle, _as_bytes 
 37   
 38  #Importing these functions with leading underscore as not intended for reuse 
 39  from Bio._py3k import urlopen as _urlopen 
 40  from Bio._py3k import quote as _quote 
 41   
 42  #Constant 
 43  _BASE_URL = "http://togows.dbcls.jp" 
 44   
 45  #Caches: 
 46  _search_db_names = None 
 47  _entry_db_names = None 
 48  _entry_db_fields = {} 
 49  _entry_db_formats = {} 
 50  _convert_formats = [] 
 51   
 52   
53 -def _get_fields(url):
54 """Queries a TogoWS URL for a plain text list of values (PRIVATE).""" 55 handle = _open(url) 56 fields = handle.read().strip().split() 57 handle.close() 58 return fields
59 60
61 -def _get_entry_dbs():
62 return _get_fields(_BASE_URL + "/entry")
63 64
65 -def _get_entry_fields(db):
66 return _get_fields(_BASE_URL + "/entry/%s?fields" % db)
67 68
69 -def _get_entry_formats(db):
70 return _get_fields(_BASE_URL + "/entry/%s?formats" % db)
71 72
73 -def _get_convert_formats():
74 return [pair.split(".") for pair in 75 _get_fields(_BASE_URL + "/convert/")]
76 77
78 -def entry(db, id, format=None, field=None):
79 """TogoWS fetch entry (returns a handle). 80 81 db - database (string), see list below. 82 id - identier (string) or a list of identifiers (either as a list of 83 strings or a single string with comma separators). 84 format - return data file format (string), options depend on the database 85 e.g. "xml", "json", "gff", "fasta", "ttl" (RDF Turtle) 86 field - specific field from within the database record (string) 87 e.g. "au" or "authors" for pubmed. 88 89 At the time of writing, this includes the following: 90 91 KEGG: compound, drug, enzyme, genes, glycan, orthology, reaction, 92 module, pathway 93 DDBj: ddbj, dad, pdb 94 NCBI: nuccore, nucest, nucgss, nucleotide, protein, gene, onim, 95 homologue, snp, mesh, pubmed 96 EBI: embl, uniprot, uniparc, uniref100, uniref90, uniref50 97 98 For the current list, please see http://togows.dbcls.jp/entry/ 99 100 This function is essentially equivalent to the NCBI Entrez service 101 EFetch, available in Biopython as Bio.Entrez.efetch(...), but that 102 does not offer field extraction. 103 """ 104 global _entry_db_names, _entry_db_fields, fetch_db_formats 105 if _entry_db_names is None: 106 _entry_db_names = _get_entry_dbs() 107 if db not in _entry_db_names: 108 raise ValueError("TogoWS entry fetch does not officially support " 109 "database '%s'." % db) 110 if field: 111 try: 112 fields = _entry_db_fields[db] 113 except KeyError: 114 fields = _get_entry_fields(db) 115 _entry_db_fields[db] = fields 116 if db == "pubmed" and field == "ti" and "title" in fields: 117 #Backwards compatibility fix for TogoWS change Nov/Dec 2013 118 field = "title" 119 import warnings 120 warnings.warn("TogoWS dropped 'pubmed' field alias 'ti', please use 'title' instead.") 121 if field not in fields: 122 raise ValueError("TogoWS entry fetch does not explicitly support " 123 "field '%s' for database '%s'. Only: %s" 124 % (field, db, ", ".join(sorted(fields)))) 125 if format: 126 try: 127 formats = _entry_db_formats[db] 128 except KeyError: 129 formats = _get_entry_formats(db) 130 _entry_db_formats[db] = formats 131 if format not in formats: 132 raise ValueError("TogoWS entry fetch does not explicitly support " 133 "format '%s' for database '%s'. Only: %s" 134 % (format, db, ", ".join(sorted(formats)))) 135 136 if isinstance(id, list): 137 id = ",".join(id) 138 url = _BASE_URL + "/entry/%s/%s" % (db, _quote(id)) 139 if field: 140 url += "/" + field 141 if format: 142 url += "." + format 143 return _open(url)
144 145
146 -def search_count(db, query):
147 """TogoWS search count (returns an integer). 148 149 db - database (string), see http://togows.dbcls.jp/search 150 query - search term (string) 151 152 You could then use the count to download a large set of search results in 153 batches using the offset and limit options to Bio.TogoWS.search(). In 154 general however the Bio.TogoWS.search_iter() function is simpler to use. 155 """ 156 global _search_db_names 157 if _search_db_names is None: 158 _search_db_names = _get_fields(_BASE_URL + "/search") 159 if db not in _search_db_names: 160 #TODO - Make this a ValueError? Right now despite the HTML website 161 #claiming to, the "gene" or "ncbi-gene" don't work and are not listed. 162 import warnings 163 warnings.warn("TogoWS search does not officially support database '%s'. " 164 "See %s/search/ for options." % (db, _BASE_URL)) 165 url = _BASE_URL + "/search/%s/%s/count" % (db, _quote(query)) 166 handle = _open(url) 167 data = handle.read() 168 handle.close() 169 try: 170 count = int(data.strip()) 171 except ValueError: 172 raise ValueError("Expected an integer from URL %s, got: %r" % (url, data)) 173 return count
174 175
176 -def search_iter(db, query, limit=None, batch=100):
177 """TogoWS search iteratating over the results (generator function). 178 179 db - database (string), see http://togows.dbcls.jp/search 180 query - search term (string) 181 limit - optional upper bound on number of search results 182 batch - number of search results to pull back each time talk to 183 TogoWS (currently limited to 100). 184 185 You would use this function within a for loop, e.g. 186 187 >>> for id in search_iter("pubmed", "lung+cancer+drug", limit=10): 188 ... print(id) #maybe fetch data with entry? 189 190 Internally this first calls the Bio.TogoWS.search_count() and then 191 uses Bio.TogoWS.search() to get the results in batches. 192 """ 193 count = search_count(db, query) 194 if not count: 195 raise StopIteration 196 #NOTE - We leave it to TogoWS to enforce any upper bound on each 197 #batch, they currently return an HTTP 400 Bad Request if above 100. 198 remain = count 199 if limit is not None: 200 remain = min(remain, limit) 201 offset = 1 # They don't use zero based counting 202 prev_ids = [] # Just cache the last batch for error checking 203 while remain: 204 batch = min(batch, remain) 205 #print("%r left, asking for %r" % (remain, batch)) 206 ids = search(db, query, offset, batch).read().strip().split() 207 assert len(ids) == batch, "Got %i, expected %i" % (len(ids), batch) 208 #print("offset %i, %s ... %s" % (offset, ids[0], ids[-1])) 209 if ids == prev_ids: 210 raise RuntimeError("Same search results for previous offset") 211 for identifier in ids: 212 if identifier in prev_ids: 213 raise RuntimeError("Result %s was in previous batch" 214 % identifier) 215 yield identifier 216 offset += batch 217 remain -= batch 218 prev_ids = ids
219 220
221 -def search(db, query, offset=None, limit=None, format=None):
222 """TogoWS search (returns a handle). 223 224 This is a low level wrapper for the TogoWS search function, which 225 can return results in a several formats. In general, the search_iter 226 function is more suitable for end users. 227 228 db - database (string), see http://togows.dbcls.jp/search/ 229 query - search term (string) 230 offset, limit - optional integers specifying which result to start from 231 (1 based) and the number of results to return. 232 format - return data file format (string), e.g. "json", "ttl" (RDF) 233 By default plain text is returned, one result per line. 234 235 At the time of writing, TogoWS applies a default count limit of 100 236 search results, and this is an upper bound. To access more results, 237 use the offset argument or the search_iter(...) function. 238 239 TogoWS supports a long list of databases, including many from the NCBI 240 (e.g. "ncbi-pubmed" or "pubmed", "ncbi-genbank" or "genbank", and 241 "ncbi-taxonomy"), EBI (e.g. "ebi-ebml" or "embl", "ebi-uniprot" or 242 "uniprot, "ebi-go"), and KEGG (e.g. "kegg-compound" or "compound"). 243 For the current list, see http://togows.dbcls.jp/search/ 244 245 The NCBI provide the Entrez Search service (ESearch) which is similar, 246 available in Biopython as the Bio.Entrez.esearch() function. 247 248 See also the function Bio.TogoWS.search_count() which returns the number 249 of matches found, and the Bio.TogoWS.search_iter() function which allows 250 you to iterate over the search results (taking care of batching for you). 251 """ 252 global _search_db_names 253 if _search_db_names is None: 254 _search_db_names = _get_fields(_BASE_URL + "/search") 255 if db not in _search_db_names: 256 #TODO - Make this a ValueError? Right now despite the HTML website 257 #claiming to, the "gene" or "ncbi-gene" don't work and are not listed. 258 import warnings 259 warnings.warn("TogoWS search does not explicitly support database '%s'. " 260 "See %s/search/ for options." % (db, _BASE_URL)) 261 url = _BASE_URL + "/search/%s/%s" % (db, _quote(query)) 262 if offset is not None and limit is not None: 263 try: 264 offset = int(offset) 265 except: 266 raise ValueError("Offset should be an integer (at least one), not %r" % offset) 267 try: 268 limit = int(limit) 269 except: 270 raise ValueError("Limit should be an integer (at least one), not %r" % limit) 271 if offset <= 0: 272 raise ValueError("Offset should be at least one, not %i" % offset) 273 if limit <= 0: 274 raise ValueError("Count should be at least one, not %i" % limit) 275 url += "/%i,%i" % (offset, limit) 276 elif offset is not None or limit is not None: 277 raise ValueError("Expect BOTH offset AND limit to be provided (or neither)") 278 if format: 279 url += "." + format 280 #print(url) 281 return _open(url)
282 283
284 -def convert(data, in_format, out_format):
285 """TogoWS convert (returns a handle). 286 287 data - string or handle containing input record(s) 288 in_format - string describing the input file format (e.g. "genbank") 289 out_format - string describing the requested output format (e.g. "fasta") 290 291 For a list of supported conversions (e.g. "genbank" to "fasta"), see 292 http://togows.dbcls.jp/convert/ 293 294 Note that Biopython has built in support for conversion of sequence and 295 alignnent file formats (functions Bio.SeqIO.convert and Bio.AlignIO.convert) 296 """ 297 global _convert_formats 298 if not _convert_formats: 299 _convert_formats = _get_convert_formats() 300 if [in_format, out_format] not in _convert_formats: 301 msg = "\n".join("%s -> %s" % tuple(pair) for pair in _convert_formats) 302 raise ValueError("Unsupported conversion. Choose from:\n%s" % msg) 303 url = _BASE_URL + "/convert/%s.%s" % (in_format, out_format) 304 #TODO - Should we just accept a string not a handle? What about a filename? 305 if hasattr(data, "read"): 306 #Handle 307 return _open(url, post=data.read()) 308 else: 309 #String 310 return _open(url, post=data)
311 312
313 -def _open(url, post=None):
314 """Helper function to build the URL and open a handle to it (PRIVATE). 315 316 Open a handle to TogoWS, will raise an IOError if it encounters an error. 317 318 In the absense of clear guidelines, this function enforces a limit of 319 "up to three queries per second" to avoid abusing the TogoWS servers. 320 """ 321 delay = 0.333333333 # one third of a second 322 current = time.time() 323 wait = _open.previous + delay - current 324 if wait > 0: 325 time.sleep(wait) 326 _open.previous = current + wait 327 else: 328 _open.previous = current 329 330 #print(url) 331 if post: 332 handle = _urlopen(url, _as_bytes(post)) 333 else: 334 handle = _urlopen(url) 335 336 #We now trust TogoWS to have set an HTTP error code, that 337 #suffices for my current unit tests. Previously we would 338 #examine the start of the data returned back. 339 return _binary_to_string_handle(handle)
340 341 _open.previous = 0 342