Package Bio :: Package TogoWS
[hide private]
[frames] | no frames]

Source Code for Package Bio.TogoWS

  1  # Copyright 2010-2011 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Provides code to access the TogoWS integrated webservices of DBCLS, Japan. 
  7   
  8  This module aims to make the TogoWS (from DBCLS, Japan) easier to use. See: 
  9  http://togows.dbcls.jp/ 
 10   
 11  The TogoWS REST service provides simple access to a range of databases, acting 
 12  as a proxy to shield you from all the different provider APIs. This works using 
 13  simple URLs (which this module will construct for you). For more details, see 
 14  http://togows.dbcls.jp/site/en/rest.html 
 15   
 16  The functionality is somewhat similar to Biopython's Bio.Entrez module which 
 17  provides access to the NCBI's Entrez Utilities (E-Utils) which also covers a 
 18  wide range of databases. 
 19   
 20  Currently TogoWS does not provide any usage guidelines (unlike the NCBI whose 
 21  requirements are reasonably clear). To avoid risking overloading the service, 
 22  Biopython will only allow three calls per second. 
 23   
 24  The TogoWS SOAP service offers a more complex API for calling web services 
 25  (essentially calling remote functions) provided by DDBJ, KEGG and PDBj. For 
 26  example, this allows you to run a remote BLAST search at the DDBJ. This is 
 27  not yet covered by this module, however there are lots of Python examples 
 28  on the TogoWS website using the SOAPpy python library. See: 
 29  http://togows.dbcls.jp/site/en/soap.html 
 30  http://soapy.sourceforge.net/ 
 31  """ 
 32   
 33  from __future__ import print_function 
 34   
 35  import time 
 36  from Bio._py3k import _binary_to_string_handle, _as_bytes 
 37   
 38  # Importing these functions with leading underscore as not intended for reuse 
 39  from Bio._py3k import urlopen as _urlopen 
 40  from Bio._py3k import quote as _quote 
 41   
 42   
 43  # Constant 
 44  _BASE_URL = "http://togows.dbcls.jp" 
 45   
 46  # Caches: 
 47  _search_db_names = None 
 48  _entry_db_names = None 
 49  _entry_db_fields = {} 
 50  _entry_db_formats = {} 
 51  _convert_formats = [] 
 52   
 53   
def _get_fields(url):
    """Query a TogoWS URL for a plain text list of values (PRIVATE).

    Arguments:
     - url - complete URL (string) to request.

    Returns the whitespace separated tokens of the response as a list of
    strings (an empty list if the response is blank).
    """
    handle = _open(url)
    try:
        # Close the handle even if reading the response fails part way.
        return handle.read().strip().split()
    finally:
        handle.close()
60 61
def _get_entry_dbs():
    """Return the values listed at the TogoWS "/entry" URL (PRIVATE).

    entry() treats the result as the list of supported database names.
    """
    listing_url = _BASE_URL + "/entry"
    return _get_fields(listing_url)
64 65
def _get_entry_fields(db):
    """Return the values listed at "/entry/<db>?fields" (PRIVATE).

    entry() treats the result as the field names supported for database db.
    """
    path = "/entry/%s?fields" % db
    return _get_fields(_BASE_URL + path)
68 69
def _get_entry_formats(db):
    """Return the values listed at "/entry/<db>?formats" (PRIVATE).

    entry() treats the result as the format names supported for database db.
    """
    path = "/entry/%s?formats" % db
    return _get_fields(_BASE_URL + path)
72 73
def _get_convert_formats():
    """Return the conversions listed at the TogoWS "/convert/" URL (PRIVATE).

    Each listed value is split on "." so that, for instance, a value of the
    form "in.out" becomes the list ["in", "out"].
    """
    conversions = []
    for name in _get_fields(_BASE_URL + "/convert/"):
        conversions.append(name.split("."))
    return conversions
77 78
def entry(db, id, format=None, field=None):
    """Call TogoWS 'entry' to fetch a record.

    Arguments:
     - db - database (string), see list below.
     - id - identifier (string) or a list of identifiers (either as a list of
       strings or a single string with comma separators).
     - format - return data file format (string), options depend on the database
       e.g. "xml", "json", "gff", "fasta", "ttl" (RDF Turtle)
     - field - specific field from within the database record (string)
       e.g. "au" or "authors" for pubmed.

    At the time of writing, this includes the following::

        KEGG: compound, drug, enzyme, genes, glycan, orthology, reaction,
              module, pathway
        DDBj: ddbj, dad, pdb
        NCBI: nuccore, nucest, nucgss, nucleotide, protein, gene, omim,
              homologue, snp, mesh, pubmed
        EBI:  embl, uniprot, uniparc, uniref100, uniref90, uniref50

    For the current list, please see http://togows.dbcls.jp/entry/

    This function is essentially equivalent to the NCBI Entrez service
    EFetch, available in Biopython as Bio.Entrez.efetch(...), but that
    does not offer field extraction.

    Returns the handle produced by the module's private _open() helper for
    the constructed URL.

    Raises ValueError if db, field or format is not in the list reported by
    TogoWS (the lists are fetched on first use and cached at module level).
    """
    # Only _entry_db_names is rebound here; the per-database caches are
    # dictionaries mutated in place and therefore need no global statement.
    global _entry_db_names
    if _entry_db_names is None:
        _entry_db_names = _get_entry_dbs()
    if db not in _entry_db_names:
        raise ValueError("TogoWS entry fetch does not officially support "
                         "database '%s'." % db)
    if field:
        try:
            fields = _entry_db_fields[db]
        except KeyError:
            fields = _get_entry_fields(db)
            _entry_db_fields[db] = fields
        if db == "pubmed" and field == "ti" and "title" in fields:
            # Backwards compatibility fix for TogoWS change Nov/Dec 2013
            field = "title"
            import warnings
            warnings.warn("TogoWS dropped 'pubmed' field alias 'ti', please use 'title' instead.")
        if field not in fields:
            raise ValueError("TogoWS entry fetch does not explicitly support "
                             "field '%s' for database '%s'. Only: %s"
                             % (field, db, ", ".join(sorted(fields))))
    if format:
        try:
            formats = _entry_db_formats[db]
        except KeyError:
            formats = _get_entry_formats(db)
            _entry_db_formats[db] = formats
        if format not in formats:
            raise ValueError("TogoWS entry fetch does not explicitly support "
                             "format '%s' for database '%s'. Only: %s"
                             % (format, db, ", ".join(sorted(formats))))

    if isinstance(id, list):
        # TogoWS takes multiple identifiers as one comma separated string.
        id = ",".join(id)
    url = _BASE_URL + "/entry/%s/%s" % (db, _quote(id))
    if field:
        url += "/" + field
    if format:
        url += "." + format
    return _open(url)
146 147
def search_count(db, query):
    """Call TogoWS search count to see how many matches a search gives.

    Arguments:
     - db - database (string), see http://togows.dbcls.jp/search
     - query - search term (string)

    You could then use the count to download a large set of search results in
    batches using the offset and limit options to Bio.TogoWS.search(). In
    general however the Bio.TogoWS.search_iter() function is simpler to use.

    Returns the number of matches as an integer.

    Raises ValueError if TogoWS returns no data, or data which cannot be
    parsed as an integer. An unlisted database only triggers a warning.
    """
    global _search_db_names
    if _search_db_names is None:
        _search_db_names = _get_fields(_BASE_URL + "/search")
    if db not in _search_db_names:
        # TODO - Make this a ValueError? Right now despite the HTML website
        # claiming to, the "gene" or "ncbi-gene" don't work and are not listed.
        import warnings
        warnings.warn("TogoWS search does not officially support database '%s'. "
                      "See %s/search/ for options." % (db, _BASE_URL))
    url = _BASE_URL + "/search/%s/%s/count" % (db, _quote(query))
    handle = _open(url)
    try:
        # Close the handle even if reading the response fails part way.
        data = handle.read()
    finally:
        handle.close()
    if not data:
        raise ValueError("TogoWS returned no data from URL %s" % url)
    try:
        count = int(data.strip())
    except ValueError:
        raise ValueError("Expected an integer from URL %s, got: %r" % (url, data))
    return count
179 180
def search_iter(db, query, limit=None, batch=100):
    """Call TogoWS search iterating over the results (generator function).

    Arguments:
     - db - database (string), see http://togows.dbcls.jp/search
     - query - search term (string)
     - limit - optional upper bound on number of search results
     - batch - number of search results to pull back each time talk to
       TogoWS (currently limited to 100).

    You would use this function within a for loop, e.g.

    >>> for id in search_iter("pubmed", "lung+cancer+drug", limit=10):
    ...     print(id)  # maybe fetch data with entry?

    Internally this first calls the Bio.TogoWS.search_count() and then
    uses Bio.TogoWS.search() to get the results in batches.

    Yields the identifiers (strings) one at a time.

    Raises RuntimeError if a batch repeats the previous batch, or contains
    an identifier already seen in the previous batch.
    """
    count = search_count(db, query)
    if not count:
        # A plain return ends the generator; raising StopIteration inside a
        # generator body is turned into RuntimeError under PEP 479.
        return
    # NOTE - We leave it to TogoWS to enforce any upper bound on each
    # batch, they currently return an HTTP 400 Bad Request if above 100.
    remain = count
    if limit is not None:
        remain = min(remain, limit)
    offset = 1  # They don't use zero based counting
    prev_ids = []  # Just cache the last batch for error checking
    while remain:
        batch = min(batch, remain)
        handle = search(db, query, offset, batch)
        try:
            # Close each batch handle rather than leaving it to the
            # garbage collector.
            ids = handle.read().strip().split()
        finally:
            handle.close()
        assert len(ids) == batch, "Got %i, expected %i" % (len(ids), batch)
        if ids == prev_ids:
            raise RuntimeError("Same search results for previous offset")
        for identifier in ids:
            if identifier in prev_ids:
                raise RuntimeError("Result %s was in previous batch"
                                   % identifier)
            yield identifier
        offset += batch
        remain -= batch
        prev_ids = ids
225 226
def search(db, query, offset=None, limit=None, format=None):
    """Call TogoWS search.

    This is a low level wrapper for the TogoWS search function, which
    can return results in a several formats. In general, the search_iter
    function is more suitable for end users.

    Arguments:
     - db - database (string), see http://togows.dbcls.jp/search/
     - query - search term (string)
     - offset, limit - optional integers specifying which result to start from
       (1 based) and the number of results to return.
     - format - return data file format (string), e.g. "json", "ttl" (RDF)
       By default plain text is returned, one result per line.

    At the time of writing, TogoWS applies a default count limit of 100
    search results, and this is an upper bound. To access more results,
    use the offset argument or the search_iter(...) function.

    TogoWS supports a long list of databases, including many from the NCBI
    (e.g. "ncbi-pubmed" or "pubmed", "ncbi-genbank" or "genbank", and
    "ncbi-taxonomy"), EBI (e.g. "ebi-ebml" or "embl", "ebi-uniprot" or
    "uniprot", "ebi-go"), and KEGG (e.g. "kegg-compound" or "compound").
    For the current list, see http://togows.dbcls.jp/search/

    The NCBI provide the Entrez Search service (ESearch) which is similar,
    available in Biopython as the Bio.Entrez.esearch() function.

    See also the function Bio.TogoWS.search_count() which returns the number
    of matches found, and the Bio.TogoWS.search_iter() function which allows
    you to iterate over the search results (taking care of batching for you).

    Returns the handle produced by the module's private _open() helper for
    the constructed URL.

    Raises ValueError if only one of offset and limit is given, or if either
    cannot be converted to an integer of at least one. An unlisted database
    only triggers a warning.
    """
    global _search_db_names
    if _search_db_names is None:
        _search_db_names = _get_fields(_BASE_URL + "/search")
    if db not in _search_db_names:
        # TODO - Make this a ValueError? Right now despite the HTML website
        # claiming to, the "gene" or "ncbi-gene" don't work and are not listed.
        import warnings
        warnings.warn("TogoWS search does not explicitly support database '%s'. "
                      "See %s/search/ for options." % (db, _BASE_URL))
    url = _BASE_URL + "/search/%s/%s" % (db, _quote(query))
    if offset is not None and limit is not None:
        try:
            offset = int(offset)
        except ValueError:
            raise ValueError("Offset should be an integer (at least one), not %r" % offset)
        try:
            limit = int(limit)
        except ValueError:
            raise ValueError("Limit should be an integer (at least one), not %r" % limit)
        if offset <= 0:
            raise ValueError("Offset should be at least one, not %i" % offset)
        if limit <= 0:
            # Name the parameter the caller actually passed (limit).
            raise ValueError("Limit should be at least one, not %i" % limit)
        url += "/%i,%i" % (offset, limit)
    elif offset is not None or limit is not None:
        raise ValueError("Expect BOTH offset AND limit to be provided (or neither)")
    if format:
        url += "." + format
    return _open(url)
289 290
def convert(data, in_format, out_format):
    """Call TogoWS for file format conversion.

    Arguments:
     - data - string or handle containing input record(s)
     - in_format - string describing the input file format (e.g. "genbank")
     - out_format - string describing the requested output format (e.g. "fasta")

    For a list of supported conversions (e.g. "genbank" to "fasta"), see
    http://togows.dbcls.jp/convert/

    Note that Biopython has built in support for conversion of sequence and
    alignment file formats (functions Bio.SeqIO.convert and Bio.AlignIO.convert)

    Raises ValueError, listing the known conversions, if the requested pair
    is not among those reported by TogoWS.
    """
    global _convert_formats
    if not _convert_formats:
        # First use (or an empty listing last time): ask TogoWS what it offers.
        _convert_formats = _get_convert_formats()
    requested = [in_format, out_format]
    if requested not in _convert_formats:
        msg = "\n".join("%s -> %s" % tuple(pair) for pair in _convert_formats)
        raise ValueError("Unsupported conversion. Choose from:\n%s" % msg)
    url = _BASE_URL + "/convert/%s.%s" % (in_format, out_format)
    # TODO - Should we just accept a string not a handle? What about a filename?
    # Anything with a read() method is treated as a handle, otherwise as a string.
    payload = data.read() if hasattr(data, "read") else data
    return _open(url, post=payload)
319 320
def _open(url, post=None):
    """Open a handle to the given TogoWS URL (PRIVATE).

    Arguments:
     - url - complete URL (string) to open.
     - post - optional data; when given (and not empty) it is converted to
       bytes and passed to the URL opener as its second argument.

    Any error raised by the URL opener propagates to the caller.

    In the absence of clear guidelines, this function enforces a limit of
    "up to three queries per second" to avoid abusing the TogoWS servers.
    """
    min_gap = 0.333333333  # one third of a second
    now = time.time()
    pause = _open.previous + min_gap - now
    if pause > 0:
        time.sleep(pause)
        # Record the time this request was scheduled for, not when we woke up.
        _open.previous = now + pause
    else:
        _open.previous = now

    if post:
        raw_handle = _urlopen(url, _as_bytes(post))
    else:
        raw_handle = _urlopen(url)

    # We now trust TogoWS to have set an HTTP error code, that
    # suffices for my current unit tests. Previously we would
    # examine the start of the data returned back.
    return _binary_to_string_handle(raw_handle)


# Time of the previous request; zero means the first call never waits.
_open.previous = 0