Package Bio :: Package Entrez :: Module Parser
[hide private]
[frames] | no frames]

Source Code for Module Bio.Entrez.Parser

  1  # Copyright 2008-2014 by Michiel de Hoon.  All rights reserved. 
  2  # Revisions copyright 2008-2015 by Peter Cock. All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Parser for XML results returned by NCBI's Entrez Utilities. 
  8   
  9  This parser is used by the read() function in Bio.Entrez, and is not 
 10  intended to be used directly. 
 11   
 12  The question is how to represent an XML file as Python objects. Some 
 13  XML files returned by NCBI look like lists, others look like dictionaries, 
 14  and others look like a mix of lists and dictionaries. 
 15   
 16  My approach is to classify each possible element in the XML as a plain 
 17  string, an integer, a list, a dictionary, or a structure. The latter is a 
 18  dictionary where the same key can occur multiple times; in Python, it is 
 19  represented as a dictionary where that key occurs once, pointing to a list 
 20  of values found in the XML file. 
 21   
 22  The parser then goes through the XML and creates the appropriate Python 
 23  object for each element. The different levels encountered in the XML are 
 24  preserved on the Python side. So a subelement of a subelement of an element 
 25  is a value in a dictionary that is stored in a list which is a value in 
 26  some other dictionary (or a value in a list which itself belongs to a list 
 27  which is a value in a dictionary, and so on). Attributes encountered in 
 28  the XML are stored as a dictionary in a member .attributes of each element, 
 29  and the tag name is saved in a member .tag. 
 30   
 31  To decide which kind of Python object corresponds to each element in the 
 32  XML, the parser analyzes the DTD referred to at the top of (almost) every 
 33  XML file returned by the Entrez Utilities. This is preferred over a hand- 
 34  written solution, since the number of DTDs is rather large and their 
 35  contents may change over time. About half the code in this parser deals 
 36  with parsing the DTD, and the other half with the XML itself. 
 37  """ 
 38  import sys 
 39  import re 
 40  import os 
 41  import warnings 
 42  from xml.parsers import expat 
 43  from io import BytesIO 
 44  import xml.etree.ElementTree as ET 
 45   
 46  # Importing these functions with leading underscore as not intended for reuse 
 47  from Bio._py3k import urlopen as _urlopen 
 48  from Bio._py3k import urlparse as _urlparse 
 49  from Bio._py3k import unicode 
 50   
 51   
 52  # The following five classes are used to add a member .attributes to integers, 
 53  # strings, unicode strings, lists, and dictionaries, respectively. 
 54   
 55   
class IntegerElement(int):
    """An int that may carry an optional ``attributes`` member.

    The repr is that of a plain int until ``attributes`` has been set on the
    instance; from then on the attributes are shown as well.
    """

    def __repr__(self):
        plain = int.__repr__(self)
        if not hasattr(self, "attributes"):
            return plain
        return "IntegerElement(%s, attributes=%r)" % (plain, self.attributes)
64 65
class StringElement(str):
    """A str that may carry an optional ``attributes`` member.

    Without ``attributes`` the repr is the ordinary quoted string; with it,
    the repr names the class and lists the attributes.
    """

    def __repr__(self):
        quoted = str.__repr__(self)
        if hasattr(self, "attributes"):
            return "StringElement(%s, attributes=%s)" % (
                quoted, repr(self.attributes))
        return quoted
74 75
class UnicodeElement(unicode):
    """A unicode string that may carry an optional ``attributes`` member.

    Without ``attributes`` the repr is that of the underlying unicode string;
    with it, the repr names the class and lists the attributes.
    """

    def __repr__(self):
        plain = unicode.__repr__(self)
        if not hasattr(self, "attributes"):
            return plain
        return "UnicodeElement(%s, attributes=%r)" % (plain, self.attributes)
84 85
class ListElement(list):
    """A list that may carry an optional ``attributes`` member.

    Without ``attributes`` the repr is the ordinary list repr; with it, the
    repr names the class and lists the attributes.
    """

    def __repr__(self):
        items = list.__repr__(self)
        if hasattr(self, "attributes"):
            return "ListElement(%s, attributes=%r)" % (items, self.attributes)
        return items
94 95
class DictionaryElement(dict):
    """A dict that may carry an optional ``attributes`` member.

    Without ``attributes`` the repr is the ordinary dict repr; with it, the
    repr is ``DictElement(..., attributes=...)``.
    """

    def __repr__(self):
        mapping = dict.__repr__(self)
        if not hasattr(self, "attributes"):
            return mapping
        return "DictElement(%s, attributes=%s)" % (
            mapping, repr(self.attributes))
104 105 106 # A StructureElement is like a dictionary, but some of its keys can have 107 # multiple values associated with them. These values are stored in a list 108 # under each key.
class StructureElement(dict):
    """Dictionary in which selected keys collect their values in a list.

    Every key passed to the constructor starts out mapped to an empty list,
    and assigning to such a key appends to that list instead of replacing
    it. All other keys behave as in a plain dict.
    """

    def __init__(self, keys):
        dict.__init__(self)
        self.listkeys = keys
        for list_key in keys:
            dict.__setitem__(self, list_key, [])

    def __setitem__(self, key, value):
        if key not in self.listkeys:
            dict.__setitem__(self, key, value)
            return
        # Multi-valued key: accumulate instead of overwriting.
        self[key].append(value)

    def __repr__(self):
        mapping = dict.__repr__(self)
        if not hasattr(self, "attributes"):
            return mapping
        return "DictElement(%s, attributes=%r)" % (mapping, self.attributes)
129 130
class NotXMLError(ValueError):
    """Error whose message tells the user the input is probably not XML."""

    def __init__(self, message):
        self.msg = message

    def __str__(self):
        template = ("Failed to parse the XML data (%s). Please make sure "
                    "that the input data are in XML format.")
        return template % self.msg
137 138
class CorruptedXMLError(ValueError):
    """Error whose message tells the user the XML input is probably corrupted."""

    def __init__(self, message):
        self.msg = message

    def __str__(self):
        template = ("Failed to parse the XML data (%s). Please make sure "
                    "that the input data are not corrupted.")
        return template % self.msg
145 146
class ValidationError(ValueError):
    """XML tag found which was not defined in the DTD.

    A validating parser raises this error when it meets a tag in the XML
    that the DTD does not define; a non-validating parser does not raise
    it. The Bio.Entrez.read and Bio.Entrez.parse functions use validating
    parsers by default (see those functions for more information).
    """

    def __init__(self, name):
        self.name = name

    def __str__(self):
        template = ("Failed to find tag '%s' in the DTD. To skip all tags "
                    "that are not represented in the DTD, please call "
                    "Bio.Entrez.read or Bio.Entrez.parse with "
                    "validate=False.")
        return template % self.name
163 164
class DataHandler(object):
    """Expat-based handler that turns Entrez XML into Python objects.

    The class body runs once, when the class is created. It works out the
    per-user and package-level directories holding DTD and XSD files,
    creates the per-user directories if needed, and stores the four paths
    as class attributes (local_dtd_dir, local_xsd_dir, global_dtd_dir,
    global_xsd_dir).
    """

    # Imported inside the class body and deleted again below, so the module
    # does not become a class attribute.
    import platform
    if platform.system() == 'Windows':
        # NOTE(review): os.getenv returns None when APPDATA is unset, which
        # would make os.path.join fail here - confirm this cannot happen.
        directory = os.path.join(os.getenv("APPDATA"), "biopython")
    else:  # Unix/Linux/Mac
        home = os.path.expanduser('~')
        directory = os.path.join(home, '.config', 'biopython')
        del home
    # Per-user directories for DTD and XSD files.
    local_dtd_dir = os.path.join(directory, 'Bio', 'Entrez', 'DTDs')
    local_xsd_dir = os.path.join(directory, 'Bio', 'Entrez', 'XSDs')
    # Remove the temporaries so they do not linger as class attributes.
    del directory
    del platform
    try:
        os.makedirs(local_dtd_dir)  # use exist_ok=True on Python >= 3.2
    except OSError as exception:
        # Check if local_dtd_dir already exists, and that it is a directory.
        # Trying os.makedirs first and then checking for os.path.isdir avoids
        # a race condition.
        if not os.path.isdir(local_dtd_dir):
            raise exception
    try:
        os.makedirs(local_xsd_dir)  # use exist_ok=True on Python >= 3.2
    except OSError as exception:
        # Same check as for the DTD directory above.
        if not os.path.isdir(local_xsd_dir):
            raise exception

    # Directories inside the installed Bio.Entrez package itself.
    from Bio import Entrez
    global_dtd_dir = os.path.join(str(Entrez.__path__[0]), "DTDs")
    global_xsd_dir = os.path.join(str(Entrez.__path__[0]), "XSDs")
    del Entrez
def __init__(self, validate):
    """Create a fresh handler with empty bookkeeping and a new expat parser.

    Arguments:
     - validate - stored as self.validating; startElementHandler raises
       ValidationError for unknown tags when it is true.
    """
    self.validating = validate
    self.is_schema = False
    # Objects under construction, innermost last.
    self.stack = []
    # Element names grouped by the kind of Python object they map to.
    self.errors = []
    self.integers = []
    self.strings = []
    self.lists = []
    self.dictionaries = []
    self.structures = {}
    self.items = []
    # URLs of the DTDs currently being loaded (used to resolve relative paths).
    self.dtd_urls = []
    xml_parser = expat.ParserCreate(namespace_separator=" ")
    xml_parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
    # The remaining handlers are installed once the XML declaration is seen.
    xml_parser.XmlDeclHandler = self.xmlDeclHandler
    self.parser = xml_parser
212
def read(self, handle):
    """Set up the parser and let it parse the XML results"""
    # HACK: undo the Bio._py3k handle conversion, since the Entrez XML
    # parser expects binary data
    if handle.__class__.__name__ == 'EvilHandleHack':
        handle = handle._handle
    if handle.__class__.__name__ == 'TextIOWrapper':
        handle = handle.buffer
    if hasattr(handle, "closed") and handle.closed:
        # Should avoid a possible Segmentation Fault, see:
        # http://bugs.python.org/issue4877
        raise IOError("Can't parse a closed handle")
    if sys.version_info[0] >= 3:
        # Another nasty hack to cope with a unicode StringIO handle
        # since the Entrez XML parser expects binary data (bytes)
        from io import StringIO
        if isinstance(handle, StringIO):
            from Bio._py3k import _as_bytes
            handle = BytesIO(_as_bytes(handle.read()))
    try:
        self.parser.ParseFile(handle)
    except expat.ExpatError as error:
        if self.parser.StartElementHandler:
            # The XML declaration was seen, so this is XML data; most
            # likely the file is corrupted.
            raise CorruptedXMLError(error)
        # No XML declaration was seen, so the input is probably not XML.
        raise NotXMLError(error)
    try:
        return self.object
    except AttributeError:
        if self.parser.StartElementHandler:
            # The XML declaration was seen and expat reported no errors,
            # so self.object should exist. If not, this is a bug.
            raise RuntimeError(
                "Failed to parse the XML file correctly, possibly due to "
                "a bug in Bio.Entrez. Please contact the Biopython "
                "developers at biopython-dev@biopython.org for assistance.")
        # No XML declaration was seen, so the input is probably not XML.
        raise NotXMLError("XML declaration not found")
256
def parse(self, handle):
    """Generator yielding the records of a list-like XML file one by one.

    The handle is read in blocks of 1024 and fed to the expat parser. A
    record is yielded as soon as the top-level list on the stack holds more
    than one entry; whatever is left is yielded from self.object once the
    input is exhausted.
    """
    block_size = 1024
    while True:
        # Read in another block of the file...
        chunk = handle.read(block_size)
        if not chunk:
            # We have reached the end of the XML file
            if self.stack:
                # No more XML data, but there is still some unfinished
                # business
                raise CorruptedXMLError("Premature end of XML stream")
            try:
                for record in self.object:
                    yield record
            except AttributeError:
                if self.parser.StartElementHandler:
                    # The XML declaration was seen and expat reported no
                    # errors, so self.object should exist. If not, this
                    # is a bug.
                    raise RuntimeError(
                        "Failed to parse the XML file correctly, possibly "
                        "due to a bug in Bio.Entrez. Please contact the "
                        "Biopython developers at "
                        "biopython-dev@biopython.org for assistance.")
                # No XML declaration was seen, so the input is probably
                # not in XML format.
                raise NotXMLError("XML declaration not found")
            self.parser.Parse("", True)
            self.parser = None
            return

        try:
            self.parser.Parse(chunk, False)
        except expat.ExpatError as error:
            if self.parser.StartElementHandler:
                # The XML declaration was seen, so this is XML data; most
                # likely the file is corrupted.
                raise CorruptedXMLError(error)
            # No XML declaration was seen, so the input is probably not
            # in XML format.
            raise NotXMLError(error)

        if self.stack:
            pending = self.stack[0]
            if not isinstance(pending, list):
                raise ValueError("The XML file does not represent a list. Please use Entrez.read instead of Entrez.parse")
            # All entries but the last one are complete records.
            while len(pending) > 1:
                yield pending.pop(0)
        # Otherwise: haven't read enough from the XML file yet
308
def xmlDeclHandler(self, version, encoding, standalone):
    """Install the remaining expat callbacks once the XML declaration is seen.

    The three arguments are supplied by expat and are not used here.
    """
    callbacks = (
        ("StartElementHandler", self.startElementHandler),
        ("EndElementHandler", self.endElementHandler),
        ("CharacterDataHandler", self.characterDataHandler),
        ("ExternalEntityRefHandler", self.externalEntityRefHandler),
        ("StartNamespaceDeclHandler", self.startNamespaceDeclHandler),
    )
    for slot, callback in callbacks:
        setattr(self.parser, slot, callback)
316
def startNamespaceDeclHandler(self, prefix, un):
    """Flag the document as schema-based, or reject other XML namespaces.

    A namespace URI containing "Schema" sets self.is_schema; any other
    namespace declaration raises NotImplementedError. The prefix is unused.
    """
    if "Schema" not in un:
        raise NotImplementedError("The Bio.Entrez parser cannot handle XML data that make use of XML namespaces")
    # This is an xml schema
    self.is_schema = True
323
def startElementHandler(self, name, attrs):
    """Handle an opening tag by creating the Python object for the element.

    Arguments:
     - name  - tag name of the element being opened
     - attrs - attributes of the element (read by key and copied via dict())

    Container-like elements (lists, dictionaries, structures, items) get a
    new object that is attached to the object on top of the stack and then
    pushed onto the stack. For elements classified as strings, errors or
    integers only the attributes are remembered in self.attributes.

    Raises ValidationError if the tag is in none of the known categories
    while self.validating is true.
    """
    # preprocessing the xml schema
    if self.is_schema:
        if len(attrs) == 1:
            # The single attribute value is used as the schema location.
            schema = list(attrs.values())[0]
            handle = self.open_xsd_file(os.path.basename(schema))
            # if there is no local xsd file grab the url and parse the file
            if not handle:
                handle = _urlopen(schema)
                text = handle.read()
                self.save_xsd_file(os.path.basename(schema), text)
                handle.close()
                self.parse_xsd(ET.fromstring(text))
            else:
                self.parse_xsd(ET.fromstring(handle.read()))
                handle.close()
    # Start collecting character data for this element from scratch.
    self.content = ""
    if name in self.lists:
        object = ListElement()
    elif name in self.dictionaries:
        object = DictionaryElement()
    elif name in self.structures:
        object = StructureElement(self.structures[name])
    elif name in self.items:  # Only appears in ESummary
        # The real name and type of an Item are given by its attributes.
        name = str(attrs["Name"])  # convert from Unicode
        del attrs["Name"]
        itemtype = str(attrs["Type"])  # convert from Unicode
        del attrs["Type"]
        if itemtype == "Structure":
            object = DictionaryElement()
        elif name in ("ArticleIds", "History"):
            object = StructureElement(["pubmed", "medline"])
        elif itemtype == "List":
            object = ListElement()
        else:
            object = StringElement()
        object.itemname = name
        object.itemtype = itemtype
    elif name in self.strings + self.errors + self.integers:
        # Leaf element: nothing is pushed on the stack; the attributes are
        # kept for endElementHandler.
        self.attributes = attrs
        return
    else:
        # Element not found in DTD
        if self.validating:
            raise ValidationError(name)
        else:
            # this will not be stored in the record
            object = ""
    # NOTE(review): an empty StringElement also compares equal to "", so
    # string-typed Items skip this branch and are only pushed on the stack.
    if object != "":
        object.tag = name
        if attrs:
            object.attributes = dict(attrs)
        if len(self.stack) != 0:
            current = self.stack[-1]
            # List-like parents take append(); dict-like parents are
            # assigned by key.
            try:
                current.append(object)
            except AttributeError:
                current[name] = object
    self.stack.append(object)
383
def endElementHandler(self, name):
    """Handle a closing tag: finish the element and store its value.

    Arguments:
     - name - tag name of the element being closed

    For container elements the finished object is popped from the stack
    into self.object. For leaf elements the collected character data are
    converted to an IntegerElement, StringElement or UnicodeElement and
    attached to the object on top of the stack.

    Raises RuntimeError with the element text if a non-empty error element
    is closed.
    """
    value = self.content
    if name in self.errors:
        if value == "":
            return
        else:
            raise RuntimeError(value)
    elif name in self.integers:
        value = IntegerElement(value)
    elif name in self.strings:
        # Convert Unicode strings to plain strings if possible
        try:
            value = StringElement(value)
        except UnicodeEncodeError:
            value = UnicodeElement(value)
    elif name in self.items:
        self.object = self.stack.pop()
        if self.object.itemtype in ("List", "Structure"):
            # Containers were already attached when the tag was opened.
            return
        elif self.object.itemtype == "Integer" and value:
            value = IntegerElement(value)
        else:
            # Convert Unicode strings to plain strings if possible
            try:
                value = StringElement(value)
            except UnicodeEncodeError:
                value = UnicodeElement(value)
        # Items are stored under the name taken from their Name attribute.
        name = self.object.itemname
    else:
        # Any other element: pop its object; character data are only kept
        # (whitespace removed) for schema-based documents.
        self.object = self.stack.pop()
        value = re.sub(r"[\s]+", "", value)
        if self.is_schema and value:
            self.object.update({'data': value})
        return
    value.tag = name
    # NOTE(review): self.attributes is only assigned in startElementHandler
    # for string/error/integer elements - confirm it always exists here for
    # Item elements.
    if self.attributes:
        value.attributes = dict(self.attributes)
        del self.attributes
    current = self.stack[-1]
    # An empty string on the stack marks an element that is not stored.
    if current != "":
        try:
            current.append(value)
        except AttributeError:
            current[name] = value
428
def characterDataHandler(self, content):
    """Append a piece of character data to the text collected so far."""
    self.content = self.content + content
431
def parse_xsd(self, root):
    """Classify the elements declared in an XML schema as lists or dictionaries.

    Arguments:
     - root - ElementTree element at the root of the parsed XSD file

    For each direct child of root, the subtree (including the child itself)
    is scanned. The last "name" attribute seen on a tag containing "element"
    is used as the element name. If any tag in the subtree contains
    "attribute", the name is appended to self.dictionaries; otherwise it is
    appended to self.lists. The name carries over from the previous child
    when a subtree declares none.
    """
    name = ""
    for child in root:
        is_dictionary = False
        # Element.iter() replaces Element.getiterator(), which was removed
        # in Python 3.9; both walk the subtree starting at the element.
        for element in child.iter():
            if "element" in element.tag and "name" in element.attrib:
                name = element.attrib['name']
            if "attribute" in element.tag:
                is_dictionary = True
        if is_dictionary:
            self.dictionaries.append(name)
        else:
            self.lists.append(name)
447
def elementDecl(self, name, model):
    """Callback function is called for each element declaration in a DTD.

    This is used for each element declaration in a DTD like::

        <!ELEMENT name (...)>

    The purpose of this function is to determine whether this element
    should be regarded as a string, integer, list, dictionary, structure,
    or error.
    """
    xm = expat.model
    grouping = (xm.XML_CTYPE_SEQ, xm.XML_CTYPE_CHOICE)
    at_most_once = (xm.XML_CQUANT_NONE, xm.XML_CQUANT_OPT)
    repeatable = (xm.XML_CQUANT_PLUS, xm.XML_CQUANT_REP)

    if name.upper() == "ERROR":
        self.errors.append(name)
        return
    # Special case. As far as I can tell, this only occurs in the
    # eSummary DTD.
    item_model = (xm.XML_CTYPE_MIXED,
                  xm.XML_CQUANT_REP,
                  None,
                  ((xm.XML_CTYPE_NAME, xm.XML_CQUANT_NONE, 'Item', ()),))
    if name == 'Item' and model == item_model:
        self.items.append(name)
        return
    # First, remove ignorable parentheses around declarations
    while (model[0] in grouping and
           model[1] in at_most_once and
           len(model[3]) == 1):
        model = model[3][0]
    # PCDATA declarations correspond to strings
    if model[0] in (xm.XML_CTYPE_MIXED, xm.XML_CTYPE_EMPTY):
        self.strings.append(name)
        return
    # List-type elements
    if model[0] in grouping and model[1] in repeatable:
        self.lists.append(name)
        return
    # This is the tricky case. Check which keys can occur multiple
    # times. If only one key is possible, and it can occur multiple
    # times, then this is a list. If more than one key is possible,
    # but none of them can occur multiple times, then this is a
    # dictionary. Otherwise, this is a structure.
    # 'single' collects the keys that can occur only once, 'multiple'
    # those that can occur several times.
    single = []
    multiple = []

    def tally(node):
        # Walk the content model recursively so that every child is
        # counted. Error keys are ignored; they raise an exception in
        # Python.
        quantifier, child_name, children = node[1:]
        if child_name is None:
            if quantifier in repeatable:
                multiple.extend(child[2] for child in children)
            else:
                for child in children:
                    tally(child)
        elif child_name.upper() != "ERROR":
            if quantifier in at_most_once:
                single.append(child_name)
            elif quantifier in repeatable:
                multiple.append(child_name)

    tally(model)
    if not single and len(multiple) == 1:
        self.lists.append(name)
    elif not multiple:
        self.dictionaries.append(name)
    else:
        self.structures.update({name: multiple})
531
def open_dtd_file(self, filename):
    """Open a DTD file for binary reading, or return None if it is not found.

    The per-user directory (DataHandler.local_dtd_dir) is tried first, then
    the package directory (DataHandler.global_dtd_dir).
    """
    for folder in (DataHandler.local_dtd_dir, DataHandler.global_dtd_dir):
        try:
            return open(os.path.join(folder, filename), "rb")
        except IOError:
            continue
    return None
548
def open_xsd_file(self, filename):
    """Open an XSD file for binary reading, or return None if it is not found.

    The per-user directory (DataHandler.local_xsd_dir) is tried first, then
    the package directory (DataHandler.global_xsd_dir).
    """
    for folder in (DataHandler.local_xsd_dir, DataHandler.global_xsd_dir):
        try:
            return open(os.path.join(folder, filename), "rb")
        except IOError:
            continue
    return None
565
def save_dtd_file(self, filename, text):
    """Write a DTD into the per-user DTD directory on a best-effort basis.

    Arguments:
     - filename - name of the file inside DataHandler.local_dtd_dir
     - text     - content to write; the file is opened in binary mode

    If the file cannot be opened for writing, a warning is issued and
    nothing is saved.
    """
    path = os.path.join(DataHandler.local_dtd_dir, filename)
    try:
        handle = open(path, "wb")
    except IOError:
        warnings.warn("Failed to save %s at %s" % (filename, path))
    else:
        # The context manager closes the file even if the write fails, so
        # the handle is not leaked.
        with handle:
            handle.write(text)
575
def save_xsd_file(self, filename, text):
    """Write an XSD into the per-user XSD directory on a best-effort basis.

    Arguments:
     - filename - name of the file inside DataHandler.local_xsd_dir
     - text     - content to write; the file is opened in binary mode

    If the file cannot be opened for writing, a warning is issued and
    nothing is saved.
    """
    path = os.path.join(DataHandler.local_xsd_dir, filename)
    try:
        handle = open(path, "wb")
    except IOError:
        warnings.warn("Failed to save %s at %s" % (filename, path))
    else:
        # The context manager closes the file even if the write fails, so
        # the handle is not leaked.
        with handle:
            handle.write(text)
585
def externalEntityRefHandler(self, context, base, systemId, publicId):
    """Load the DTD referenced by the XML, preferring a local copy.

    Using a local DTD results in much faster parsing than downloading it
    from the URL specified in the XML. If the DTD is not found locally, it
    is downloaded and saved for later use. If new DTDs become available
    from NCBI, putting them in Bio/Entrez/DTDs will allow the parser to
    see them.
    """
    scheme = _urlparse(systemId)[0]
    if scheme in ('http', 'https', 'ftp'):
        # Then this is an absolute path to the DTD.
        url = systemId
    elif scheme == '':
        # Then this is a relative path to the DTD.
        # Look at the parent URL to find the full path.
        if self.dtd_urls:
            source = os.path.dirname(self.dtd_urls[-1])
        else:
            # Assume the default URL for DTDs if the top parent
            # does not contain an absolute path
            source = "http://www.ncbi.nlm.nih.gov/dtd/"
        # urls always have a forward slash, don't use os.path.join
        url = source.rstrip("/") + "/" + systemId
    else:
        raise ValueError("Unexpected URL scheme %r" % (scheme))
    self.dtd_urls.append(url)
    # First, try to load the local version of the DTD file
    filename = os.path.split(systemId)[1]
    handle = self.open_dtd_file(filename)
    if not handle:
        # DTD is not available as a local file. Try accessing it through
        # the internet instead.
        try:
            remote = _urlopen(url)
        except IOError:
            raise RuntimeError("Failed to access %s at %s" % (filename, url))
        text = remote.read()
        remote.close()
        self.save_dtd_file(filename, text)
        handle = BytesIO(text)

    # Parse the DTD with a sub-parser that reports element declarations.
    dtd_parser = self.parser.ExternalEntityParserCreate(context)
    dtd_parser.ElementDeclHandler = self.elementDecl
    dtd_parser.ParseFile(handle)
    handle.close()
    self.dtd_urls.pop()
    return 1
636