Package Bio :: Package TogoWS
[hide private]
[frames] | no frames]

Source Code for Package Bio.TogoWS

  1  # Copyright 2010-2011 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Provides code to access the TogoWS integrated web services of DBCLS, Japan. 
  7   
  8  This module aims to make the TogoWS (from DBCLS, Japan) easier to use. See: 
  9  http://togows.dbcls.jp/ 
 10   
 11  The TogoWS REST service provides simple access to a range of databases, acting 
 12  as a proxy to shield you from all the different provider APIs. This works using 
 13  simple URLs (which this module will construct for you). For more details, see 
 14  http://togows.dbcls.jp/site/en/rest.html 
 15   
 16  The functionality is somewhat similar to Biopython's Bio.Entrez module which 
 17  provides access to the NCBI's Entrez Utilities (E-Utils) which also covers a 
 18  wide range of databases. 
 19   
 20  Currently TogoWS does not provide any usage guidelines (unlike the NCBI whose 
 21  requirements are reasonably clear). To avoid risking overloading the service, 
 22  Biopython will only allow three calls per second. 
 23   
 24  The TogoWS SOAP service offers a more complex API for calling web services 
 25  (essentially calling remote functions) provided by DDBJ, KEGG and PDBj. For 
 26  example, this allows you to run a remote BLAST search at the DDBJ. This is 
 27  not yet covered by this module, however there are lots of Python examples 
 28  on the TogoWS website using the SOAPpy python library. See: 
 29  http://togows.dbcls.jp/site/en/soap.html 
 30  http://soapy.sourceforge.net/ 
 31  """ 
 32   
 33  from __future__ import print_function 
 34   
 35  import time 
 36  from Bio._py3k import _binary_to_string_handle, _as_bytes 
 37   
 38  # Importing these functions with leading underscore as not intended for reuse 
 39  from Bio._py3k import urlopen as _urlopen 
 40  from Bio._py3k import quote as _quote 
 41   
 42   
 43  # Constant 
 44  _BASE_URL = "http://togows.dbcls.jp" 
 45   
 46  # Caches: 
 47  _search_db_names = None 
 48  _entry_db_names = None 
 49  _entry_db_fields = {} 
 50  _entry_db_formats = {} 
 51  _convert_formats = [] 
 52   
 53   
def _get_fields(url):
    """Query a TogoWS URL for a plain text list of values (PRIVATE).

    Opens ``url`` with ``_open``, reads the entire response, strips leading
    and trailing whitespace, and returns the whitespace-separated tokens as
    a list of strings.
    """
    handle = _open(url)
    try:
        # split() with no argument splits on any run of whitespace, so it
        # does not matter whether values are space or newline separated.
        return handle.read().strip().split()
    finally:
        # Release the handle even if read() fails part way through.
        handle.close()
60 61
def _get_entry_dbs():
    """Return the values listed at the TogoWS ``/entry`` URL (PRIVATE)."""
    url = _BASE_URL + "/entry"
    return _get_fields(url)
64 65
def _get_entry_fields(db):
    """Return the values listed at ``/entry/<db>?fields`` (PRIVATE)."""
    path = "/entry/%s?fields" % db
    return _get_fields(_BASE_URL + path)
68 69
def _get_entry_formats(db):
    """Return the values listed at ``/entry/<db>?formats`` (PRIVATE)."""
    path = "/entry/%s?formats" % db
    return _get_fields(_BASE_URL + path)
72 73
def _get_convert_formats():
    """Return the values listed at ``/convert/``, each split on "." (PRIVATE).

    The result is a list of lists of strings, one inner list per value
    returned by the service.
    """
    pairs = []
    for name in _get_fields(_BASE_URL + "/convert/"):
        pairs.append(name.split("."))
    return pairs
77 78
def entry(db, id, format=None, field=None):
    """TogoWS fetch entry (returns a handle).

     - db - database (string), see list below.
     - id - identifier (string) or a list of identifiers (either as a list of
       strings or a single string with comma separators).
     - format - return data file format (string), options depend on the database
       e.g. "xml", "json", "gff", "fasta", "ttl" (RDF Turtle)
     - field - specific field from within the database record (string)
       e.g. "au" or "authors" for pubmed.

    At the time of writing, this includes the following::

        KEGG: compound, drug, enzyme, genes, glycan, orthology, reaction,
              module, pathway
        DDBj: ddbj, dad, pdb
        NCBI: nuccore, nucest, nucgss, nucleotide, protein, gene, omim,
              homologue, snp, mesh, pubmed
        EBI:  embl, uniprot, uniparc, uniref100, uniref90, uniref50

    For the current list, please see http://togows.dbcls.jp/entry/

    This function is essentially equivalent to the NCBI Entrez service
    EFetch, available in Biopython as Bio.Entrez.efetch(...), but that
    does not offer field extraction.

    Raises ValueError if the database is not in the list reported by
    TogoWS, or if a requested field or format is not in the list TogoWS
    reports for that database. The lists are fetched on first use and
    cached at module level.
    """
    # Only _entry_db_names is rebound here; the per-database field and
    # format caches are dictionaries which are mutated in place and so do
    # not need a global declaration.
    global _entry_db_names
    if _entry_db_names is None:
        _entry_db_names = _get_entry_dbs()
    if db not in _entry_db_names:
        raise ValueError("TogoWS entry fetch does not officially support "
                         "database '%s'." % db)
    if field:
        try:
            fields = _entry_db_fields[db]
        except KeyError:
            fields = _get_entry_fields(db)
            _entry_db_fields[db] = fields
        if db == "pubmed" and field == "ti" and "title" in fields:
            # Backwards compatibility fix for TogoWS change Nov/Dec 2013
            field = "title"
            import warnings
            warnings.warn("TogoWS dropped 'pubmed' field alias 'ti', please use 'title' instead.")
        if field not in fields:
            raise ValueError("TogoWS entry fetch does not explicitly support "
                             "field '%s' for database '%s'. Only: %s"
                             % (field, db, ", ".join(sorted(fields))))
    if format:
        try:
            formats = _entry_db_formats[db]
        except KeyError:
            formats = _get_entry_formats(db)
            _entry_db_formats[db] = formats
        if format not in formats:
            raise ValueError("TogoWS entry fetch does not explicitly support "
                             "format '%s' for database '%s'. Only: %s"
                             % (format, db, ", ".join(sorted(formats))))

    if isinstance(id, list):
        # Multiple identifiers are sent as a single comma separated string.
        id = ",".join(id)
    url = _BASE_URL + "/entry/%s/%s" % (db, _quote(id))
    if field:
        url += "/" + field
    if format:
        url += "." + format
    return _open(url)
145 146
def search_count(db, query):
    """TogoWS search count (returns an integer).

     - db - database (string), see http://togows.dbcls.jp/search
     - query - search term (string)

    You could then use the count to download a large set of search results in
    batches using the offset and limit options to Bio.TogoWS.search(). In
    general however the Bio.TogoWS.search_iter() function is simpler to use.

    A warning (not an error) is issued if the database is not in the list
    reported by TogoWS. Raises ValueError if the service returns no data,
    or data which cannot be parsed as an integer.
    """
    global _search_db_names
    if _search_db_names is None:
        _search_db_names = _get_fields(_BASE_URL + "/search")
    if db not in _search_db_names:
        # TODO - Make this a ValueError? Right now despite the HTML website
        # claiming to, the "gene" or "ncbi-gene" don't work and are not listed.
        import warnings
        warnings.warn("TogoWS search does not officially support database '%s'. "
                      "See %s/search/ for options." % (db, _BASE_URL))
    url = _BASE_URL + "/search/%s/%s/count" % (db, _quote(query))
    handle = _open(url)
    try:
        data = handle.read()
    finally:
        # Release the handle even if read() fails part way through.
        handle.close()
    if not data:
        raise ValueError("TogoWS returned no data from URL %s" % url)
    try:
        return int(data.strip())
    except ValueError:
        # Re-raise with the URL and raw payload to make debugging easier.
        raise ValueError("Expected an integer from URL %s, got: %r" % (url, data))
177 178
def search_iter(db, query, limit=None, batch=100):
    """TogoWS search iterating over the results (generator function).

     - db - database (string), see http://togows.dbcls.jp/search
     - query - search term (string)
     - limit - optional upper bound on number of search results
     - batch - number of search results to pull back each time talk to
       TogoWS (currently limited to 100).

    You would use this function within a for loop, e.g.

    >>> for id in search_iter("pubmed", "lung+cancer+drug", limit=10):
    ...     print(id)  # maybe fetch data with entry?

    Internally this first calls the Bio.TogoWS.search_count() and then
    uses Bio.TogoWS.search() to get the results in batches.

    Raises RuntimeError if a batch repeats the previous batch, or contains
    an identifier already seen in the previous batch.
    """
    count = search_count(db, query)
    if not count:
        # A plain return ends the generator cleanly. Explicitly raising
        # StopIteration inside a generator is turned into a RuntimeError
        # under PEP 479 (the default from Python 3.7).
        return
    # NOTE - We leave it to TogoWS to enforce any upper bound on each
    # batch, they currently return an HTTP 400 Bad Request if above 100.
    remain = count
    if limit is not None:
        remain = min(remain, limit)
    offset = 1  # They don't use zero based counting
    prev_ids = []  # Just cache the last batch for error checking
    while remain:
        # The final batch may be smaller than the requested batch size.
        batch = min(batch, remain)
        handle = search(db, query, offset, batch)
        try:
            ids = handle.read().strip().split()
        finally:
            # Do not leave one open handle behind per batch.
            handle.close()
        assert len(ids) == batch, "Got %i, expected %i" % (len(ids), batch)
        if ids == prev_ids:
            raise RuntimeError("Same search results for previous offset")
        for identifier in ids:
            if identifier in prev_ids:
                raise RuntimeError("Result %s was in previous batch"
                                   % identifier)
            yield identifier
        offset += batch
        remain -= batch
        prev_ids = ids
222 223
def search(db, query, offset=None, limit=None, format=None):
    """TogoWS search (returns a handle).

    This is a low level wrapper for the TogoWS search function, which
    can return results in several formats. In general, the search_iter
    function is more suitable for end users.

     - db - database (string), see http://togows.dbcls.jp/search/
     - query - search term (string)
     - offset, limit - optional integers specifying which result to start from
       (1 based) and the number of results to return.
     - format - return data file format (string), e.g. "json", "ttl" (RDF)
       By default plain text is returned, one result per line.

    At the time of writing, TogoWS applies a default count limit of 100
    search results, and this is an upper bound. To access more results,
    use the offset argument or the search_iter(...) function.

    TogoWS supports a long list of databases, including many from the NCBI
    (e.g. "ncbi-pubmed" or "pubmed", "ncbi-genbank" or "genbank", and
    "ncbi-taxonomy"), EBI (e.g. "ebi-ebml" or "embl", "ebi-uniprot" or
    "uniprot", "ebi-go"), and KEGG (e.g. "kegg-compound" or "compound").
    For the current list, see http://togows.dbcls.jp/search/

    The NCBI provide the Entrez Search service (ESearch) which is similar,
    available in Biopython as the Bio.Entrez.esearch() function.

    See also the function Bio.TogoWS.search_count() which returns the number
    of matches found, and the Bio.TogoWS.search_iter() function which allows
    you to iterate over the search results (taking care of batching for you).

    A warning (not an error) is issued if the database is not in the list
    reported by TogoWS. Raises ValueError if only one of offset and limit
    is given, or if either cannot be converted to an integer of at least
    one.
    """
    global _search_db_names
    if _search_db_names is None:
        _search_db_names = _get_fields(_BASE_URL + "/search")
    if db not in _search_db_names:
        # TODO - Make this a ValueError? Right now despite the HTML website
        # claiming to, the "gene" or "ncbi-gene" don't work and are not listed.
        import warnings
        warnings.warn("TogoWS search does not explicitly support database '%s'. "
                      "See %s/search/ for options." % (db, _BASE_URL))
    url = _BASE_URL + "/search/%s/%s" % (db, _quote(query))
    if offset is not None and limit is not None:
        # Catch only the errors int() raises for bad input, so that things
        # like KeyboardInterrupt are not masked as a ValueError.
        try:
            offset = int(offset)
        except (TypeError, ValueError, OverflowError):
            raise ValueError("Offset should be an integer (at least one), not %r" % offset)
        try:
            limit = int(limit)
        except (TypeError, ValueError, OverflowError):
            raise ValueError("Limit should be an integer (at least one), not %r" % limit)
        if offset <= 0:
            raise ValueError("Offset should be at least one, not %i" % offset)
        if limit <= 0:
            raise ValueError("Limit should be at least one, not %i" % limit)
        url += "/%i,%i" % (offset, limit)
    elif offset is not None or limit is not None:
        raise ValueError("Expect BOTH offset AND limit to be provided (or neither)")
    if format:
        url += "." + format
    return _open(url)
285 286
def convert(data, in_format, out_format):
    """TogoWS convert (returns a handle).

     - data - string or handle containing input record(s)
     - in_format - string describing the input file format (e.g. "genbank")
     - out_format - string describing the requested output format (e.g. "fasta")

    For a list of supported conversions (e.g. "genbank" to "fasta"), see
    http://togows.dbcls.jp/convert/

    Note that Biopython has built in support for conversion of sequence and
    alignment file formats (functions Bio.SeqIO.convert and Bio.AlignIO.convert)

    Raises ValueError if the requested pair of formats is not in the list
    of conversions reported by TogoWS (fetched on first use and cached).
    """
    global _convert_formats
    if not _convert_formats:
        _convert_formats = _get_convert_formats()
    requested = [in_format, out_format]
    if requested not in _convert_formats:
        choices = ["%s -> %s" % tuple(pair) for pair in _convert_formats]
        raise ValueError("Unsupported conversion. Choose from:\n%s"
                         % "\n".join(choices))
    url = _BASE_URL + "/convert/%s.%s" % (in_format, out_format)
    # TODO - Should we just accept a string not a handle? What about a filename?
    # Anything with a read attribute is treated as a handle, else as a string.
    payload = data.read() if hasattr(data, "read") else data
    return _open(url, post=payload)
314 315
def _open(url, post=None):
    """Open a handle to a TogoWS URL, throttling the request rate (PRIVATE).

    If ``post`` is true-like it is converted with ``_as_bytes`` and sent as
    the request data, otherwise the URL is opened without any data. The
    response is wrapped with ``_binary_to_string_handle`` before being
    returned.

    In the absence of clear guidelines, this function enforces a limit of
    "up to three queries per second" to avoid abusing the TogoWS servers.
    The time of the previous call is kept on the function attribute
    ``_open.previous``.

    No checking of the response is done here, so any failure surfaces as
    whatever exception ``_urlopen`` raises.
    """
    min_gap = 0.333333333  # one third of a second
    now = time.time()
    pause = _open.previous + min_gap - now
    if pause > 0:
        # Too soon after the last request: sleep, then record the time
        # at which this request is actually being sent.
        time.sleep(pause)
        now += pause
    _open.previous = now

    if post:
        raw_handle = _urlopen(url, _as_bytes(post))
    else:
        raw_handle = _urlopen(url)

    # We now trust TogoWS to have set an HTTP error code, that
    # suffices for my current unit tests. Previously we would
    # examine the start of the data returned back.
    return _binary_to_string_handle(raw_handle)


# Time of the previous request; zero means the first call never waits.
_open.previous = 0