Package Bio :: Package Entrez :: Module Parser
[hide private]
[frames] | no frames]

Source Code for Module Bio.Entrez.Parser

  1  # Copyright 2008-2014 by Michiel de Hoon.  All rights reserved. 
  2  # Revisions copyright 2008-2015 by Peter Cock. All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Parser for XML results returned by NCBI's Entrez Utilities. 
  8   
  9  This parser is used by the read() function in Bio.Entrez, and is not 
 10  intended to be used directly. 
 11   
 12  The question is how to represent an XML file as Python objects. Some 
 13  XML files returned by NCBI look like lists, others look like dictionaries, 
 14  and others look like a mix of lists and dictionaries. 
 15   
 16  My approach is to classify each possible element in the XML as a plain 
 17  string, an integer, a list, a dictionary, or a structure. The latter is a 
 18  dictionary where the same key can occur multiple times; in Python, it is 
 19  represented as a dictionary where that key occurs once, pointing to a list 
 20  of values found in the XML file. 
 21   
 22  The parser then goes through the XML and creates the appropriate Python 
 23  object for each element. The different levels encountered in the XML are 
 24  preserved on the Python side. So a subelement of a subelement of an element 
 25  is a value in a dictionary that is stored in a list which is a value in 
 26  some other dictionary (or a value in a list which itself belongs to a list 
 27  which is a value in a dictionary, and so on). Attributes encountered in 
 28  the XML are stored as a dictionary in a member .attributes of each element, 
 29  and the tag name is saved in a member .tag. 
 30   
 31  To decide which kind of Python object corresponds to each element in the 
 32  XML, the parser analyzes the DTD referred to at the top of (almost) every 
 33  XML file returned by the Entrez Utilities. This is preferred over a hand- 
 34  written solution, since the number of DTDs is rather large and their 
 35  contents may change over time. About half the code in this parser deals 
 36  with parsing the DTD, and the other half with the XML itself. 
 37  """ 
 38  import sys 
 39  import re 
 40  import os 
 41  import warnings 
 42  from xml.parsers import expat 
 43  from io import BytesIO 
 44  import xml.etree.ElementTree as ET 
 45   
 46  # Importing these functions with leading underscore as not intended for reuse 
 47  from Bio._py3k import urlopen as _urlopen 
 48  from Bio._py3k import urlparse as _urlparse 
 49  from Bio._py3k import unicode 
 50   
 51   
 52  # The following five classes are used to add a member .attributes to integers, 
 53  # strings, unicode strings, lists, and dictionaries, respectively. 
 54   
 55   
class IntegerElement(int):
    """Integer that can carry an optional ``attributes`` member."""

    def __repr__(self):
        """Return the int repr, extended with the attributes when present."""
        plain = int.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached: look exactly like a plain int.
            return plain
        return "IntegerElement(%s, attributes=%s)" % (plain, repr(self.attributes))
64 65
class StringElement(str):
    """String that can carry an optional ``attributes`` member."""

    def __repr__(self):
        """Return the str repr, extended with the attributes when present."""
        plain = str.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached: look exactly like a plain string.
            return plain
        return "StringElement(%s, attributes=%s)" % (plain, repr(self.attributes))
74 75
class UnicodeElement(unicode):
    """Unicode string that can carry an optional ``attributes`` member."""

    def __repr__(self):
        """Return the unicode repr, extended with the attributes when present."""
        plain = unicode.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached: look exactly like a plain string.
            return plain
        return "UnicodeElement(%s, attributes=%s)" % (plain, repr(self.attributes))
84 85
class ListElement(list):
    """List that can carry an optional ``attributes`` member."""

    def __repr__(self):
        """Return the list repr, extended with the attributes when present."""
        plain = list.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached: look exactly like a plain list.
            return plain
        return "ListElement(%s, attributes=%s)" % (plain, repr(self.attributes))
94 95
class DictionaryElement(dict):
    """Dictionary that can carry an optional ``attributes`` member."""

    def __repr__(self):
        """Return the dict repr, extended with the attributes when present."""
        plain = dict.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached: look exactly like a plain dict.
            return plain
        return "DictElement(%s, attributes=%s)" % (plain, repr(self.attributes))
# A StructureElement behaves like a dictionary, except that the keys named
# at construction time may receive several values. Every value assigned to
# such a key is collected in a list stored under that key.
class StructureElement(dict):
    """Dictionary in which selected keys accumulate their values in lists."""

    def __init__(self, keys):
        """Start with an empty list under each key in ``keys``."""
        dict.__init__(self)
        self.listkeys = keys
        for listkey in keys:
            dict.__setitem__(self, listkey, [])

    def __setitem__(self, key, value):
        """Append ``value`` for a list key; store it normally otherwise."""
        if key not in self.listkeys:
            dict.__setitem__(self, key, value)
            return
        self[key].append(value)

    def __repr__(self):
        """Return the dict repr, extended with the attributes when present."""
        plain = dict.__repr__(self)
        if not hasattr(self, "attributes"):
            return plain
        return "DictElement(%s, attributes=%s)" % (plain, repr(self.attributes))
129 130
class NotXMLError(ValueError):
    """Raised when the data handed to the parser do not look like XML."""

    def __init__(self, message):
        """Remember ``message`` (a string or an exception) for ``__str__``."""
        # Forward the message to ValueError so that .args is populated.
        # With empty .args the exception cannot be copied or unpickled,
        # because it would be re-created without the required argument.
        ValueError.__init__(self, message)
        self.msg = message

    def __str__(self):
        """Return the explanatory text with the stored message embedded."""
        # Wrap in a 1-tuple so a tuple-valued message formats correctly.
        return "Failed to parse the XML data (%s). Please make sure that the input data are in XML format." % (self.msg,)
137 138
class CorruptedXMLError(ValueError):
    """Raised when XML data were recognized but could not be parsed."""

    def __init__(self, message):
        """Remember ``message`` (a string or an exception) for ``__str__``."""
        # Forward the message to ValueError so that .args is populated.
        # With empty .args the exception cannot be copied or unpickled,
        # because it would be re-created without the required argument.
        ValueError.__init__(self, message)
        self.msg = message

    def __str__(self):
        """Return the explanatory text with the stored message embedded."""
        # Wrap in a 1-tuple so a tuple-valued message formats correctly.
        return "Failed to parse the XML data (%s). Please make sure that the input data are not corrupted." % (self.msg,)
145 146
class ValidationError(ValueError):
    """XML tag found which was not defined in the DTD.

    Validating parsers raise this error if the parser finds a tag in the XML
    that is not defined in the DTD. Non-validating parsers do not raise this
    error. The Bio.Entrez.read and Bio.Entrez.parse functions use validating
    parsers by default (see those functions for more information).
    """

    def __init__(self, name):
        """Remember the ``name`` of the tag that is missing from the DTD."""
        # Forward the name to ValueError so that .args is populated.
        # With empty .args the exception cannot be copied or unpickled,
        # because it would be re-created without the required argument.
        ValueError.__init__(self, name)
        self.name = name

    def __str__(self):
        """Return the explanatory text naming the offending tag."""
        # Wrap in a 1-tuple so a tuple-valued name formats correctly.
        return ("Failed to find tag '%s' in the DTD. To skip all tags that "
                "are not represented in the DTD, please call Bio.Entrez.read "
                "or Bio.Entrez.parse with validate=False." % (self.name,))
162 163
class DataHandler(object):
    """Expat-based handler that turns Entrez XML into Python objects.

    The class-level statements below run once, when the class is defined.
    They compute the per-user and the packaged DTD/XSD directories and make
    sure the per-user directories exist.
    """

    import platform
    if platform.system() != 'Windows':
        # Unix/Linux/Mac
        directory = os.path.join(os.path.expanduser('~'), '.config', 'biopython')
    else:
        directory = os.path.join(os.getenv("APPDATA"), "biopython")
    del platform
    local_dtd_dir = os.path.join(directory, 'Bio', 'Entrez', 'DTDs')
    local_xsd_dir = os.path.join(directory, 'Bio', 'Entrez', 'XSDs')
    del directory
    for _cache_dir in (local_dtd_dir, local_xsd_dir):
        try:
            os.makedirs(_cache_dir)  # use exist_ok=True on Python >= 3.2
        except OSError as exception:
            # Calling os.makedirs first and checking os.path.isdir only on
            # failure avoids a race condition: an existing directory is
            # fine, anything else is a real error.
            if not os.path.isdir(_cache_dir):
                raise exception
    del _cache_dir

    from Bio import Entrez
    _package_dir = str(Entrez.__path__[0])
    global_dtd_dir = os.path.join(_package_dir, "DTDs")
    global_xsd_dir = os.path.join(_package_dir, "XSDs")
    del _package_dir
    del Entrez
196 - def __init__(self, validate):
197 self.stack = [] 198 self.errors = [] 199 self.integers = [] 200 self.strings = [] 201 self.lists = [] 202 self.dictionaries = [] 203 self.structures = {} 204 self.items = [] 205 self.dtd_urls = [] 206 self.validating = validate 207 self.parser = expat.ParserCreate(namespace_separator=" ") 208 self.parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS) 209 self.parser.XmlDeclHandler = self.xmlDeclHandler 210 self.is_schema = False
211
212 - def read(self, handle):
213 """Set up the parser and let it parse the XML results""" 214 # HACK: remove Bio._py3k handle conversion, since the Entrez XML parser 215 # expects binary data 216 if handle.__class__.__name__ == 'EvilHandleHack': 217 handle = handle._handle 218 if handle.__class__.__name__ == 'TextIOWrapper': 219 handle = handle.buffer 220 if hasattr(handle, "closed") and handle.closed: 221 # Should avoid a possible Segmentation Fault, see: 222 # http://bugs.python.org/issue4877 223 raise IOError("Can't parse a closed handle") 224 if sys.version_info[0] >= 3: 225 # Another nasty hack to cope with a unicode StringIO handle 226 # since the Entrez XML parser expects binary data (bytes) 227 from io import StringIO 228 if isinstance(handle, StringIO): 229 from io import BytesIO 230 from Bio._py3k import _as_bytes 231 handle = BytesIO(_as_bytes(handle.read())) 232 try: 233 self.parser.ParseFile(handle) 234 except expat.ExpatError as e: 235 if self.parser.StartElementHandler: 236 # We saw the initial <!xml declaration, so we can be sure that 237 # we are parsing XML data. Most likely, the XML file is 238 # corrupted. 239 raise CorruptedXMLError(e) 240 else: 241 # We have not seen the initial <!xml declaration, so probably 242 # the input data is not in XML format. 243 raise NotXMLError(e) 244 try: 245 return self.object 246 except AttributeError: 247 if self.parser.StartElementHandler: 248 # We saw the initial <!xml declaration, and expat didn't notice 249 # any errors, so self.object should be defined. If not, this is 250 # a bug. 251 raise RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at biopython-dev@biopython.org for assistance.") 252 else: 253 # We did not see the initial <!xml declaration, so probably 254 # the input data is not in XML format. 255 raise NotXMLError("XML declaration not found")
256
257 - def parse(self, handle):
258 BLOCK = 1024 259 while True: 260 # Read in another block of the file... 261 text = handle.read(BLOCK) 262 if not text: 263 # We have reached the end of the XML file 264 if self.stack: 265 # No more XML data, but there is still some unfinished 266 # business 267 raise CorruptedXMLError("Premature end of XML stream") 268 try: 269 for record in self.object: 270 yield record 271 except AttributeError: 272 if self.parser.StartElementHandler: 273 # We saw the initial <!xml declaration, and expat 274 # didn't notice any errors, so self.object should be 275 # defined. If not, this is a bug. 276 raise RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at biopython-dev@biopython.org for assistance.") 277 else: 278 # We did not see the initial <!xml declaration, so 279 # probably the input data is not in XML format. 280 raise NotXMLError("XML declaration not found") 281 self.parser.Parse("", True) 282 self.parser = None 283 return 284 285 try: 286 self.parser.Parse(text, False) 287 except expat.ExpatError as e: 288 if self.parser.StartElementHandler: 289 # We saw the initial <!xml declaration, so we can be sure 290 # that we are parsing XML data. Most likely, the XML file 291 # is corrupted. 292 raise CorruptedXMLError(e) 293 else: 294 # We have not seen the initial <!xml declaration, so 295 # probably the input data is not in XML format. 296 raise NotXMLError(e) 297 298 if not self.stack: 299 # Haven't read enough from the XML file yet 300 continue 301 302 records = self.stack[0] 303 if not isinstance(records, list): 304 raise ValueError("The XML file does not represent a list. Please use Entrez.read instead of Entrez.parse") 305 while len(records) > 1: # Then the top record is finished 306 record = records.pop(0) 307 yield record
308
309 - def xmlDeclHandler(self, version, encoding, standalone):
310 # XML declaration found; set the handlers 311 self.parser.StartElementHandler = self.startElementHandler 312 self.parser.EndElementHandler = self.endElementHandler 313 self.parser.CharacterDataHandler = self.characterDataHandler 314 self.parser.ExternalEntityRefHandler = self.externalEntityRefHandler 315 self.parser.StartNamespaceDeclHandler = self.startNamespaceDeclHandler
316
317 - def startNamespaceDeclHandler(self, prefix, un):
318 # This is an xml schema 319 if "Schema" in un: 320 self.is_schema = True 321 else: 322 raise NotImplementedError("The Bio.Entrez parser cannot handle XML data that make use of XML namespaces")
323
324 - def startElementHandler(self, name, attrs):
325 # preprocessing the xml schema 326 if self.is_schema: 327 if len(attrs) == 1: 328 schema = list(attrs.values())[0] 329 handle = self.open_xsd_file(os.path.basename(schema)) 330 # if there is no local xsd file grab the url and parse the file 331 if not handle: 332 handle = _urlopen(schema) 333 text = handle.read() 334 self.save_xsd_file(os.path.basename(schema), text) 335 handle.close() 336 self.parse_xsd(ET.fromstring(text)) 337 else: 338 self.parse_xsd(ET.fromstring(handle.read())) 339 handle.close() 340 self.content = "" 341 if name in self.lists: 342 object = ListElement() 343 elif name in self.dictionaries: 344 object = DictionaryElement() 345 elif name in self.structures: 346 object = StructureElement(self.structures[name]) 347 elif name in self.items: # Only appears in ESummary 348 name = str(attrs["Name"]) # convert from Unicode 349 del attrs["Name"] 350 itemtype = str(attrs["Type"]) # convert from Unicode 351 del attrs["Type"] 352 if itemtype == "Structure": 353 object = DictionaryElement() 354 elif name in ("ArticleIds", "History"): 355 object = StructureElement(["pubmed", "medline"]) 356 elif itemtype == "List": 357 object = ListElement() 358 else: 359 object = StringElement() 360 object.itemname = name 361 object.itemtype = itemtype 362 elif name in self.strings + self.errors + self.integers: 363 self.attributes = attrs 364 return 365 else: 366 # Element not found in DTD 367 if self.validating: 368 raise ValidationError(name) 369 else: 370 # this will not be stored in the record 371 object = "" 372 if object != "": 373 object.tag = name 374 if attrs: 375 object.attributes = dict(attrs) 376 if len(self.stack) != 0: 377 current = self.stack[-1] 378 try: 379 current.append(object) 380 except AttributeError: 381 current[name] = object 382 self.stack.append(object)
383
384 - def endElementHandler(self, name):
385 value = self.content 386 if name in self.errors: 387 if value == "": 388 return 389 else: 390 raise RuntimeError(value) 391 elif name in self.integers: 392 value = IntegerElement(value) 393 elif name in self.strings: 394 # Convert Unicode strings to plain strings if possible 395 try: 396 value = StringElement(value) 397 except UnicodeEncodeError: 398 value = UnicodeElement(value) 399 elif name in self.items: 400 self.object = self.stack.pop() 401 if self.object.itemtype in ("List", "Structure"): 402 return 403 elif self.object.itemtype == "Integer" and value: 404 value = IntegerElement(value) 405 else: 406 # Convert Unicode strings to plain strings if possible 407 try: 408 value = StringElement(value) 409 except UnicodeEncodeError: 410 value = UnicodeElement(value) 411 name = self.object.itemname 412 else: 413 self.object = self.stack.pop() 414 value = re.sub(r"[\s]+", "", value) 415 if self.is_schema and value: 416 self.object.update({'data': value}) 417 return 418 value.tag = name 419 if self.attributes: 420 value.attributes = dict(self.attributes) 421 del self.attributes 422 current = self.stack[-1] 423 if current != "": 424 try: 425 current.append(value) 426 except AttributeError: 427 current[name] = value
428
429 - def characterDataHandler(self, content):
430 self.content += content
431
432 - def parse_xsd(self, root):
433 is_dictionary = False 434 name = "" 435 for child in root: 436 for element in child.getiterator(): 437 if "element" in element.tag: 438 if "name" in element.attrib: 439 name = element.attrib['name'] 440 if "attribute" in element.tag: 441 is_dictionary = True 442 if is_dictionary: 443 self.dictionaries.append(name) 444 is_dictionary = False 445 else: 446 self.lists.append(name)
447
448 - def elementDecl(self, name, model):
449 """This callback function is called for each element declaration: 450 <!ELEMENT name (...)> 451 encountered in a DTD. The purpose of this function is to determine 452 whether this element should be regarded as a string, integer, list 453 dictionary, structure, or error.""" 454 if name.upper() == "ERROR": 455 self.errors.append(name) 456 return 457 if name == 'Item' and model == (expat.model.XML_CTYPE_MIXED, 458 expat.model.XML_CQUANT_REP, 459 None, ((expat.model.XML_CTYPE_NAME, 460 expat.model.XML_CQUANT_NONE, 461 'Item', 462 () 463 ), 464 ) 465 ): 466 # Special case. As far as I can tell, this only occurs in the 467 # eSummary DTD. 468 self.items.append(name) 469 return 470 # First, remove ignorable parentheses around declarations 471 while (model[0] in (expat.model.XML_CTYPE_SEQ, 472 expat.model.XML_CTYPE_CHOICE) and 473 model[1] in (expat.model.XML_CQUANT_NONE, 474 expat.model.XML_CQUANT_OPT) and 475 len(model[3]) == 1): 476 model = model[3][0] 477 # PCDATA declarations correspond to strings 478 if model[0] in (expat.model.XML_CTYPE_MIXED, 479 expat.model.XML_CTYPE_EMPTY): 480 self.strings.append(name) 481 return 482 # List-type elements 483 if (model[0] in (expat.model.XML_CTYPE_CHOICE, 484 expat.model.XML_CTYPE_SEQ) and 485 model[1] in (expat.model.XML_CQUANT_PLUS, 486 expat.model.XML_CQUANT_REP)): 487 self.lists.append(name) 488 return 489 # This is the tricky case. Check which keys can occur multiple 490 # times. If only one key is possible, and it can occur multiple 491 # times, then this is a list. If more than one key is possible, 492 # but none of them can occur multiple times, then this is a 493 # dictionary. Otherwise, this is a structure. 494 # In 'single' and 'multiple', we keep track which keys can occur 495 # only once, and which can occur multiple times. 496 single = [] 497 multiple = [] 498 # The 'count' function is called recursively to make sure all the 499 # children in this model are counted. 
Error keys are ignored; 500 # they raise an exception in Python. 501 502 def count(model): 503 quantifier, name, children = model[1:] 504 if name is None: 505 if quantifier in (expat.model.XML_CQUANT_PLUS, 506 expat.model.XML_CQUANT_REP): 507 for child in children: 508 multiple.append(child[2]) 509 else: 510 for child in children: 511 count(child) 512 elif name.upper() != "ERROR": 513 if quantifier in (expat.model.XML_CQUANT_NONE, 514 expat.model.XML_CQUANT_OPT): 515 single.append(name) 516 elif quantifier in (expat.model.XML_CQUANT_PLUS, 517 expat.model.XML_CQUANT_REP): 518 multiple.append(name)
519 count(model) 520 if len(single) == 0 and len(multiple) == 1: 521 self.lists.append(name) 522 elif len(multiple) == 0: 523 self.dictionaries.append(name) 524 else: 525 self.structures.update({name: multiple})
526
527 - def open_dtd_file(self, filename):
528 path = os.path.join(DataHandler.local_dtd_dir, filename) 529 try: 530 handle = open(path, "rb") 531 except IOError: 532 pass 533 else: 534 return handle 535 path = os.path.join(DataHandler.global_dtd_dir, filename) 536 try: 537 handle = open(path, "rb") 538 except IOError: 539 pass 540 else: 541 return handle 542 return None
543
544 - def open_xsd_file(self, filename):
545 path = os.path.join(DataHandler.local_xsd_dir, filename) 546 try: 547 handle = open(path, "rb") 548 except IOError: 549 pass 550 else: 551 return handle 552 path = os.path.join(DataHandler.global_xsd_dir, filename) 553 try: 554 handle = open(path, "rb") 555 except IOError: 556 pass 557 else: 558 return handle 559 return None
560
561 - def save_dtd_file(self, filename, text):
562 path = os.path.join(DataHandler.local_dtd_dir, filename) 563 try: 564 handle = open(path, "wb") 565 except IOError: 566 warnings.warn("Failed to save %s at %s" % (filename, path)) 567 else: 568 handle.write(text) 569 handle.close()
570
571 - def save_xsd_file(self, filename, text):
572 path = os.path.join(DataHandler.local_xsd_dir, filename) 573 try: 574 handle = open(path, "wb") 575 except IOError: 576 warnings.warn("Failed to save %s at %s" % (filename, path)) 577 else: 578 handle.write(text) 579 handle.close()
580
581 - def externalEntityRefHandler(self, context, base, systemId, publicId):
582 """The purpose of this function is to load the DTD locally, instead 583 of downloading it from the URL specified in the XML. Using the local 584 DTD results in much faster parsing. If the DTD is not found locally, 585 we try to download it. If new DTDs become available from NCBI, 586 putting them in Bio/Entrez/DTDs will allow the parser to see them.""" 587 urlinfo = _urlparse(systemId) 588 # Following attribute requires Python 2.5+ 589 # if urlinfo.scheme=='http': 590 if urlinfo[0] in ['http', 'https', 'ftp']: 591 # Then this is an absolute path to the DTD. 592 url = systemId 593 elif urlinfo[0] == '': 594 # Then this is a relative path to the DTD. 595 # Look at the parent URL to find the full path. 596 try: 597 source = self.dtd_urls[-1] 598 except IndexError: 599 # Assume the default URL for DTDs if the top parent 600 # does not contain an absolute path 601 source = "http://www.ncbi.nlm.nih.gov/dtd/" 602 else: 603 source = os.path.dirname(source) 604 # urls always have a forward slash, don't use os.path.join 605 url = source.rstrip("/") + "/" + systemId 606 else: 607 raise ValueError("Unexpected URL scheme %r" % (urlinfo[0])) 608 self.dtd_urls.append(url) 609 # First, try to load the local version of the DTD file 610 location, filename = os.path.split(systemId) 611 handle = self.open_dtd_file(filename) 612 if not handle: 613 # DTD is not available as a local file. Try accessing it through 614 # the internet instead. 615 try: 616 handle = _urlopen(url) 617 except IOError: 618 raise RuntimeError("Failed to access %s at %s" % (filename, url)) 619 text = handle.read() 620 handle.close() 621 self.save_dtd_file(filename, text) 622 handle = BytesIO(text) 623 624 parser = self.parser.ExternalEntityParserCreate(context) 625 parser.ElementDeclHandler = self.elementDecl 626 parser.ParseFile(handle) 627 handle.close() 628 self.dtd_urls.pop() 629 return 1
630