Package Bio :: Package Entrez
[hide private]
[frames] | no frames]

Source Code for Package Bio.Entrez

  1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2008 by Michiel de Hoon.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Provides code to access NCBI over the WWW. 
  8   
  9  The main Entrez web page is available at: 
 10  http://www.ncbi.nlm.nih.gov/Entrez/ 
 11   
 12  A list of the Entrez utilities is available at: 
 13  http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html 
 14   
 15  Variables: 
 16   
 17      - email        Set the Entrez email parameter (default is not set). 
 18      - tool         Set the Entrez tool parameter (default is  biopython). 
 19   
 20  Functions: 
 21   
 22      - efetch       Retrieves records in the requested format from a list of one or 
 23        more primary IDs or from the user's environment 
 24      - epost        Posts a file containing a list of primary IDs for future use in 
 25        the user's environment to use with subsequent search strategies 
 26      - esearch      Searches and retrieves primary IDs (for use in EFetch, ELink, 
 27        and ESummary) and term translations and optionally retains 
 28        results for future use in the user's environment. 
 29      - elink        Checks for the existence of an external or Related Articles link 
 30        from a list of one or more primary IDs.  Retrieves primary IDs 
 31        and relevancy scores for links to Entrez databases or Related 
 32        Articles;  creates a hyperlink to the primary LinkOut provider 
 33        for a specific ID and database, or lists LinkOut URLs 
 34        and Attributes for multiple IDs. 
 35      - einfo        Provides field index term counts, last update, and available 
 36        links for each database. 
 37      - esummary     Retrieves document summaries from a list of primary IDs or from 
 38        the user's environment. 
 39      - egquery      Provides Entrez database counts in XML for a single search 
 40        using Global Query. 
 41      - espell       Retrieves spelling suggestions. 
 42      - ecitmatch    Retrieves PubMed IDs (PMIDs) that correspond to a set of 
 43        input citation strings. 
 44   
 45      - read         Parses the XML results returned by any of the above functions. 
 46        Typical usage is: 
 47   
 48            >>> from Bio import Entrez 
 49            >>> Entrez.email = "Your.Name.Here@example.org" 
 50            >>> handle = Entrez.einfo() # or esearch, efetch, ... 
 51            >>> record = Entrez.read(handle) 
 52            >>> handle.close() 
 53   
 54         where record is now a Python dictionary or list. 
 55   
 56      - parse        Parses the XML results returned by those of the above functions 
 57        which can return multiple records - such as efetch, esummary 
 58        and elink. Typical usage is: 
 59   
 60            >>> handle = Entrez.efetch("pubmed", id="19304878,14630660", retmode="xml") 
 61            >>> records = Entrez.parse(handle) 
 62            >>> for record in records: 
 63            ...     # each record is a Python dictionary or list. 
 64            ...     print(record['MedlineCitation']['Article']['ArticleTitle']) 
 65            Biopython: freely available Python tools for computational molecular biology and bioinformatics. 
 66            PDB file parser and structure class implemented in Python. 
 67            >>> handle.close() 
 68   
 69        This function is appropriate only if the XML file contains 
 70        multiple records, and is particularly useful for large files. 
 71   
 72      - _open        Internally used function. 
 73   
 74  """ 
 75  from __future__ import print_function 
 76   
 77  import time 
 78  import warnings 
 79  import os.path 
 80   
 81  # Importing these functions with leading underscore as not intended for reuse 
 82  from Bio._py3k import urlopen as _urlopen 
 83  from Bio._py3k import urlencode as _urlencode 
 84  from Bio._py3k import HTTPError as _HTTPError 
 85   
 86  from Bio._py3k import _binary_to_string_handle, _as_bytes 
 87   
 88  __docformat__ = "restructuredtext en" 
 89   
 90  email = None 
 91  tool = "biopython" 
 92   
 93   
 94  # XXX retmode? 
def epost(db, **keywds):
    """Upload a list of identifiers to Entrez for later use.

    EPost sends a file containing a list of UIs to the user's environment
    on the NCBI side, where subsequent search strategies can refer to it.

    The accepted parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/epost_help.html

    Returns a handle to the results.

    Raises an IOError exception if there's a network error.
    """
    # db is a named parameter, so it can never also appear in keywds.
    options = dict(keywds, db=db)
    # Identifier lists can be long, so the request is sent as an HTTP POST.
    return _open('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi',
                 options, post=True)
112 113
def efetch(db, **keywords):
    """Fetch Entrez results, which are returned as a handle.

    EFetch retrieves records in the requested format from a list of one or
    more UIs or from the user's environment.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html

    The ``id`` keyword may be a comma separated string, a single integer,
    or a list/tuple of identifiers (strings or integers); sequences are
    joined with commas before the request is made. An ``id`` of None is
    treated as if no ``id`` had been given.

    Return a handle to the results.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.efetch(db="nucleotide", id="57240072", rettype="gb", retmode="text")
    >>> print(handle.readline().strip())
    LOCUS       AY851612                 892 bp    DNA     linear   PLN 10-APR-2007
    >>> handle.close()

    **Warning:** The NCBI changed the default retmode in Feb 2012, so many
    databases which previously returned text output now give XML.
    """
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    variables = {'db': db}
    variables.update(keywords)
    post = False
    ids = variables.get("id")
    if ids is not None:
        if isinstance(ids, (list, tuple)):
            # Convert each element so that lists of integers work too.
            ids = ",".join(str(uid) for uid in ids)
            variables["id"] = ids
        elif isinstance(ids, int):
            # A single numeric identifier.
            ids = str(ids)
            variables["id"] = ids
        if ids.count(",") >= 200:
            # NCBI prefers an HTTP POST instead of an HTTP GET if there are
            # more than about 200 IDs
            post = True
    return _open(cgi, variables, post)
156 157
def esearch(db, term, **keywds):
    """Run an Entrez search and return a handle to the results.

    ESearch searches and retrieves primary IDs (for use in EFetch, ELink
    and ESummary) and term translations, and optionally retains results
    for future use in the user's environment.

    The accepted parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html

    Returns a handle to the results which are always in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.esearch(db="nucleotide", retmax=10, term="opuntia[ORGN] accD")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> record["Count"] >= 2
    True
    >>> "156535671" in record["IdList"]
    True
    >>> "156535673" in record["IdList"]
    True

    """
    # db and term are named parameters, so keywds cannot contain them.
    query = dict(keywds, db=db, term=term)
    return _open('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi',
                 query)
192 193 231 232
def einfo(**keywds):
    """Return a summary of the Entrez databases as a results handle.

    EInfo provides field names, index term counts, last update, and
    available links for each Entrez database.

    The accepted parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html

    Returns a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> record = Entrez.read(Entrez.einfo())
    >>> 'pubmed' in record['DbList']
    True

    """
    # Hand _open its own copy of the keyword arguments.
    return _open('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi',
                 dict(keywds))
259 260
def esummary(**keywds):
    """Retrieve document summaries as a results handle.

    ESummary retrieves document summaries from a list of primary IDs or
    from the user's environment.

    The accepted parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esummary_help.html

    Returns a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    This example discovers more about entry 30367 in the journals database:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.esummary(db="journals", id="30367")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> print(record[0]["Id"])
    30367
    >>> print(record[0]["Title"])
    Computational biology and chemistry

    """
    request = keywds.copy()
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    return _open(url, request)
291 292
def egquery(**keywds):
    """Return Entrez database counts for a global search.

    EGQuery provides Entrez database counts in XML for a single search
    using Global Query.

    The accepted parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/egquery_help.html

    Returns a handle to the results in XML format.

    Raises an IOError exception if there's a network error.

    This quick example based on a longer version from the Biopython
    Tutorial just checks there are over 60 matches for 'Biopython'
    in PubMedCentral:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> handle = Entrez.egquery(term="biopython")
    >>> record = Entrez.read(handle)
    >>> handle.close()
    >>> for row in record["eGQueryResult"]:
    ...     if "pmc" in row["DbName"]:
    ...         print(row["Count"] > 60)
    True

    """
    # Copy the keyword arguments into the dictionary passed to _open.
    arguments = dict(keywds.items())
    return _open('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi',
                 arguments)
325 326
def espell(**keywds):
    """Retrieve spelling suggestions, returned in a results handle.

    ESpell retrieves spelling suggestions, if available.

    The accepted parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/espell_help.html

    Returns a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> record = Entrez.read(Entrez.espell(term="biopythooon"))
    >>> print(record["Query"])
    biopythooon
    >>> print(record["CorrectedQuery"])
    biopython

    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi'
    settings = {}
    for name in keywds:
        settings[name] = keywds[name]
    return _open(url, settings)
354 355
def ecitmatch(**keywds):
    """Retrieve PMIDs for a set of citations, returned as a handle.

    ECitMatch retrieves PubMed IDs (PMIDs) that correspond to a set of
    input citation strings.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ECitMatch

    The required ``bdata`` keyword is either a pre-formatted citation
    string, which is passed on unchanged, or a sequence of dictionaries
    with the keys journal_title, year, volume, first_page, author_name
    and key (missing keys are sent as empty fields). When the dictionary
    form is used the ``db`` keyword is required as well. Any further
    keyword arguments (for example ``email`` or ``tool``) are passed on
    to the server in both cases.

    Return a handle to the results, by default in plain text.

    Raises a KeyError if ``bdata`` is missing, or if ``db`` is missing
    when citations are given as dictionaries.
    Raises an IOError exception if there's a network error.

    Short example:

    >>> from Bio import Entrez
    >>> Entrez.email = "Your.Name.Here@example.org"
    >>> citation_1 = {
    ...     "journal_title": "proc natl acad sci u s a",
    ...     "year": "1991", "volume": "88", "first_page": "3248",
    ...     "author_name": "mann bj", "key": "citation_1"}
    >>> handle = Entrez.ecitmatch(db="pubmed", bdata=[citation_1])
    >>> result = handle.read()
    >>> handle.close()

    """
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/ecitmatch.cgi'
    # XML is the only supported value, and it actually returns TXT.
    variables = {'retmode': 'xml'}
    citation_keys = ('journal_title', 'year', 'volume', 'first_page',
                     'author_name', 'key')

    bdata = keywds['bdata']
    # Forward every caller-supplied option, not only db and bdata, so that
    # per-call settings such as email or tool are honoured.
    variables.update(keywds)
    if not isinstance(bdata, str):
        # Structured interface: db has always been mandatory here, so keep
        # raising KeyError when it is absent.
        variables['db'] = keywds['db']
        # One pipe separated line per citation, lines separated by '\r'.
        variables['bdata'] = '\r'.join(
            '|'.join([citation.get(key, "") for key in citation_keys])
            for citation in bdata)

    return _open(cgi, variables, ecitmatch=True)
397 398
def read(handle, validate=True):
    """Parse an XML file from the NCBI Entrez Utilities into python objects.

    The XML produced by NCBI's Entrez Utilities is converted into a
    multilevel data structure of Python lists and dictionaries.
    Most XML files returned by NCBI's Entrez Utilities can be parsed by
    this function, provided its DTD is available. Biopython includes the
    DTDs for most commonly used Entrez Utilities.

    If validate is True (default), the parser will validate the XML file
    against the DTD, and raise an error if the XML file contains tags that
    are not represented in the DTD. If validate is False, the parser will
    simply skip such tags.

    Whereas the data structure seems to consist of generic Python lists,
    dictionaries, strings, and so on, each of these is actually a class
    derived from the base type. This allows us to store the attributes
    (if any) of each element in a dictionary my_element.attributes, and
    the tag name in my_element.tag.
    """
    # Imported here rather than at module level, as in the original code.
    from .Parser import DataHandler
    return DataHandler(validate).read(handle)
423 424
def parse(handle, validate=True):
    """Parse an XML file from the NCBI Entrez Utilities record by record.

    The XML produced by NCBI's Entrez Utilities is converted into a
    multilevel data structure of Python lists and dictionaries.
    This function is suitable for XML files that (in Python) can be represented
    as a list of individual records. Whereas 'read' reads the complete file
    and returns a single Python list, 'parse' is a generator function that
    returns the records one by one. This function is therefore particularly
    useful for parsing large files.

    Most XML files returned by NCBI's Entrez Utilities can be parsed by
    this function, provided its DTD is available. Biopython includes the
    DTDs for most commonly used Entrez Utilities.

    If validate is True (default), the parser will validate the XML file
    against the DTD, and raise an error if the XML file contains tags that
    are not represented in the DTD. If validate is False, the parser will
    simply skip such tags.

    Whereas the data structure seems to consist of generic Python lists,
    dictionaries, strings, and so on, each of these is actually a class
    derived from the base type. This allows us to store the attributes
    (if any) of each element in a dictionary my_element.attributes, and
    the tag name in my_element.tag.
    """
    # Imported here rather than at module level, as in the original code.
    from .Parser import DataHandler
    return DataHandler(validate).parse(handle)
455 456
def _open(cgi, params=None, post=False, ecitmatch=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL for the cgi script to access.
    params is a dictionary with the options to pass to it; entries whose
    value is None are left out, and the dictionary passed in is not
    modified.  If post is true the options are sent as an HTTP POST,
    otherwise they are appended to the URL for an HTTP GET.  If ecitmatch
    is true, URL-encoded pipe characters in the options are restored.

    The "tool" and "email" options are filled in from the module level
    variables unless given explicitly; a UserWarning is issued if no email
    address is available.

    Any HTTP error raised while opening the URL is passed on to the
    caller unchanged.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.
    """
    # Work on a private copy without the None values, so that the caller's
    # dictionary is never altered.
    if params is None:
        params = {}
    params = dict((key, value) for key, value in params.items()
                  if value is not None)
    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current
    # Tell Entrez that we are using Biopython (or whatever the user has
    # specified explicitly in the parameters or by changing the default)
    if "tool" not in params:
        params["tool"] = tool
    # Tell Entrez who we are
    if "email" not in params:
        if email is not None:
            params["email"] = email
        else:
            warnings.warn("""
Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request.  As an example, if your email address
is A.N.Other@example.com, you can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)
    # Open a handle to Entrez.
    options = _urlencode(params, doseq=True)
    # _urlencode encodes pipes, which NCBI expects in ECitMatch
    if ecitmatch:
        options = options.replace('%7C', '|')
    if post:
        # HTTP POST
        handle = _urlopen(cgi, data=_as_bytes(options))
    else:
        # HTTP GET
        handle = _urlopen(cgi + "?" + options)

    return _binary_to_string_handle(handle)


# Time of the most recent request, used by _open for rate limiting.
_open.previous = 0
def _test():
    """Run the module's doctests (PRIVATE)."""
    import doctest
    print("Running doctests...")
    doctest.testmod()
    print("Done")


# Run the doctests when this file is executed as a script.
if __name__ == "__main__":
    _test()