Package Bio :: Package Entrez :: Module Parser
[hide private]
[frames] | no frames]

Source Code for Module Bio.Entrez.Parser

  1  # Copyright 2008-2014 by Michiel de Hoon.  All rights reserved. 
  2  # Revisions copyright 2008-2015 by Peter Cock. All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Parser for XML results returned by NCBI's Entrez Utilities. 
  8   
  9  This parser is used by the read() function in Bio.Entrez, and is not 
  10  intended to be used directly. 
 11   
 12  The question is how to represent an XML file as Python objects. Some 
 13  XML files returned by NCBI look like lists, others look like dictionaries, 
 14  and others look like a mix of lists and dictionaries. 
 15   
 16  My approach is to classify each possible element in the XML as a plain 
 17  string, an integer, a list, a dictionary, or a structure. The latter is a 
 18  dictionary where the same key can occur multiple times; in Python, it is 
 19  represented as a dictionary where that key occurs once, pointing to a list 
 20  of values found in the XML file. 
 21   
 22  The parser then goes through the XML and creates the appropriate Python 
 23  object for each element. The different levels encountered in the XML are 
 24  preserved on the Python side. So a subelement of a subelement of an element 
 25  is a value in a dictionary that is stored in a list which is a value in 
 26  some other dictionary (or a value in a list which itself belongs to a list 
 27  which is a value in a dictionary, and so on). Attributes encountered in 
 28  the XML are stored as a dictionary in a member .attributes of each element, 
 29  and the tag name is saved in a member .tag. 
 30   
 31  To decide which kind of Python object corresponds to each element in the 
  32  XML, the parser analyzes the DTD referred to at the top of (almost) every 
 33  XML file returned by the Entrez Utilities. This is preferred over a hand- 
 34  written solution, since the number of DTDs is rather large and their 
 35  contents may change over time. About half the code in this parser deals 
  36  with parsing the DTD, and the other half with the XML itself. 
 37  """ 
 38  import re 
 39  import os 
 40  import warnings 
 41  from xml.parsers import expat 
 42  from io import BytesIO 
 43  import xml.etree.ElementTree as ET 
 44   
 45  # Importing these functions with leading underscore as not intended for reuse 
 46  from Bio._py3k import urlopen as _urlopen 
 47  from Bio._py3k import urlparse as _urlparse 
 48  from Bio._py3k import unicode 
 49   
 50  __docformat__ = "restructuredtext en" 
 51   
  52  # The following five classes are used to add a member .attributes to integers, 
  53  # strings, unicode strings, lists, and dictionaries, respectively. 
 54   
 55   
class IntegerElement(int):
    """An ``int`` whose repr also shows an optional ``attributes`` member."""

    def __repr__(self):
        """Return the plain integer repr, extended with attributes if set."""
        plain = int.__repr__(self)
        try:
            extra = self.attributes
        except AttributeError:
            # Nothing was attached to this instance: look like a normal int.
            return plain
        else:
            pieces = (plain, repr(extra))
            return "IntegerElement(%s, attributes=%s)" % pieces
64 65
class StringElement(str):
    """A ``str`` whose repr also shows an optional ``attributes`` member."""

    def __repr__(self):
        """Return the plain string repr, extended with attributes if set."""
        plain = str.__repr__(self)
        try:
            extra = self.attributes
        except AttributeError:
            # Nothing was attached to this instance: look like a normal str.
            return plain
        else:
            pieces = (plain, repr(extra))
            return "StringElement(%s, attributes=%s)" % pieces
74 75
class UnicodeElement(unicode):
    """A ``unicode`` string whose repr also shows an optional ``attributes``."""

    def __repr__(self):
        """Return the plain unicode repr, extended with attributes if set."""
        plain = unicode.__repr__(self)
        try:
            extra = self.attributes
        except AttributeError:
            # Nothing was attached to this instance: use the base repr.
            return plain
        else:
            pieces = (plain, repr(extra))
            return "UnicodeElement(%s, attributes=%s)" % pieces
84 85
class ListElement(list):
    """A ``list`` whose repr also shows an optional ``attributes`` member."""

    def __repr__(self):
        """Return the plain list repr, extended with attributes if set."""
        plain = list.__repr__(self)
        try:
            extra = self.attributes
        except AttributeError:
            # Nothing was attached to this instance: look like a normal list.
            return plain
        else:
            pieces = (plain, repr(extra))
            return "ListElement(%s, attributes=%s)" % pieces
94 95
class DictionaryElement(dict):
    """A ``dict`` whose repr also shows an optional ``attributes`` member."""

    def __repr__(self):
        """Return the plain dict repr, extended with attributes if set.

        Note that the extended form is labelled ``DictElement``, not
        ``DictionaryElement``.
        """
        plain = dict.__repr__(self)
        try:
            extra = self.attributes
        except AttributeError:
            # Nothing was attached to this instance: look like a normal dict.
            return plain
        else:
            pieces = (plain, repr(extra))
            return "DictElement(%s, attributes=%s)" % pieces
# A StructureElement is like a dictionary, but some of its keys can have
# multiple values associated with them. These values are stored in a list
# under each such key.
class StructureElement(dict):
    """Dictionary in which selected keys accumulate their values in a list."""

    def __init__(self, keys):
        """Create the structure; every key in *keys* starts as an empty list.

        *keys* is kept as ``self.listkeys`` and decides, in ``__setitem__``,
        which keys collect values instead of being overwritten.
        """
        dict.__init__(self)
        self.listkeys = keys
        for listkey in keys:
            # Bypass our own __setitem__, which would append instead of set.
            dict.__setitem__(self, listkey, [])

    def __setitem__(self, key, value):
        """Append *value* for list keys; plain assignment for all other keys."""
        if key not in self.listkeys:
            dict.__setitem__(self, key, value)
            return
        self[key].append(value)

    def __repr__(self):
        """Return the plain dict repr, extended with attributes if set."""
        plain = dict.__repr__(self)
        try:
            extra = self.attributes
        except AttributeError:
            return plain
        else:
            pieces = (plain, repr(extra))
            return "DictElement(%s, attributes=%s)" % pieces
129 130
class NotXMLError(ValueError):
    """Error whose message asks the user to check the input is XML."""

    def __init__(self, message):
        """Keep *message* as ``self.msg`` for use in ``__str__``."""
        self.msg = message

    def __str__(self):
        """Return the user-facing description, embedding ``self.msg``."""
        template = ("Failed to parse the XML data (%s). Please make sure "
                    "that the input data are in XML format.")
        return template % self.msg
137 138
class CorruptedXMLError(ValueError):
    """Error whose message asks the user to check the XML is not corrupted."""

    def __init__(self, message):
        """Keep *message* as ``self.msg`` for use in ``__str__``."""
        self.msg = message

    def __str__(self):
        """Return the user-facing description, embedding ``self.msg``."""
        template = ("Failed to parse the XML data (%s). Please make sure "
                    "that the input data are not corrupted.")
        return template % self.msg
145 146
class ValidationError(ValueError):
    """Raised by validating parsers for a tag that is missing from the DTD.

    Validating parsers raise this error if the parser finds a tag in the XML
    that is not defined in the DTD. Non-validating parsers do not raise this
    error. The Bio.Entrez.read and Bio.Entrez.parse functions use validating
    parsers by default (see those functions for more information).
    """

    def __init__(self, name):
        """Remember the offending tag *name*."""
        self.name = name

    def __str__(self):
        """Return the user-facing description, embedding the tag name."""
        template = ("Failed to find tag '%s' in the DTD. To skip all tags "
                    "that are not represented in the DTD, please call "
                    "Bio.Entrez.read or Bio.Entrez.parse with validate=False.")
        return template % self.name
154 155
class DataHandler(object):
    """Expat-based handler that turns Entrez XML into Python objects.

    The class body computes, once at class-definition time, the directories
    searched for DTD/XSD files:

    - ``local_dtd_dir`` / ``local_xsd_dir``: per-user directories, created
      here if they do not exist yet;
    - ``global_dtd_dir`` / ``global_xsd_dir``: directories inside the
      installed ``Bio.Entrez`` package.
    """

    import platform
    if platform.system() == 'Windows':
        # APPDATA can be missing from the environment, in which case
        # os.getenv returns None and os.path.join would raise TypeError.
        # Fall back to the user's home directory in that case.
        directory = os.path.join(os.getenv("APPDATA") or
                                 os.path.expanduser('~'), "biopython")
    else:  # Unix/Linux/Mac
        home = os.path.expanduser('~')
        directory = os.path.join(home, '.config', 'biopython')
        del home
    local_dtd_dir = os.path.join(directory, 'Bio', 'Entrez', 'DTDs')
    local_xsd_dir = os.path.join(directory, 'Bio', 'Entrez', 'XSDs')
    # Remove the temporaries so they do not become class attributes.
    del directory
    del platform
    try:
        os.makedirs(local_dtd_dir)  # use exist_ok=True on Python >= 3.2
    except OSError as exception:
        # Check if local_dtd_dir already exists, and that it is a directory.
        # Trying os.makedirs first and then checking for os.path.isdir avoids
        # a race condition.
        if not os.path.isdir(local_dtd_dir):
            raise exception
    try:
        os.makedirs(local_xsd_dir)  # use exist_ok=True on Python >= 3.2
    except OSError as exception:
        # Same reasoning as for local_dtd_dir above.
        if not os.path.isdir(local_xsd_dir):
            raise exception

    # Imported here (and deleted again) only to locate the package directory.
    from Bio import Entrez
    global_dtd_dir = os.path.join(str(Entrez.__path__[0]), "DTDs")
    global_xsd_dir = os.path.join(str(Entrez.__path__[0]), "XSDs")
    del Entrez

188 - def __init__(self, validate):
189 self.stack = [] 190 self.errors = [] 191 self.integers = [] 192 self.strings = [] 193 self.lists = [] 194 self.dictionaries = [] 195 self.structures = {} 196 self.items = [] 197 self.dtd_urls = [] 198 self.validating = validate 199 self.parser = expat.ParserCreate(namespace_separator=" ") 200 self.parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS) 201 self.parser.XmlDeclHandler = self.xmlDeclHandler 202 self.is_schema = False
203
204 - def read(self, handle):
205 """Set up the parser and let it parse the XML results""" 206 # HACK: remove Bio._py3k handle conversion, since the Entrez XML parser 207 # expects binary data 208 if handle.__class__.__name__ == 'EvilHandleHack': 209 handle = handle._handle 210 if hasattr(handle, "closed") and handle.closed: 211 # Should avoid a possible Segmentation Fault, see: 212 # http://bugs.python.org/issue4877 213 raise IOError("Can't parse a closed handle") 214 try: 215 self.parser.ParseFile(handle) 216 except expat.ExpatError as e: 217 if self.parser.StartElementHandler: 218 # We saw the initial <!xml declaration, so we can be sure that 219 # we are parsing XML data. Most likely, the XML file is 220 # corrupted. 221 raise CorruptedXMLError(e) 222 else: 223 # We have not seen the initial <!xml declaration, so probably 224 # the input data is not in XML format. 225 raise NotXMLError(e) 226 try: 227 return self.object 228 except AttributeError: 229 if self.parser.StartElementHandler: 230 # We saw the initial <!xml declaration, and expat didn't notice 231 # any errors, so self.object should be defined. If not, this is 232 # a bug. 233 raise RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at biopython-dev@biopython.org for assistance.") 234 else: 235 # We did not see the initial <!xml declaration, so probably 236 # the input data is not in XML format. 237 raise NotXMLError("XML declaration not found")
238
239 - def parse(self, handle):
240 BLOCK = 1024 241 while True: 242 # Read in another block of the file... 243 text = handle.read(BLOCK) 244 if not text: 245 # We have reached the end of the XML file 246 if self.stack: 247 # No more XML data, but there is still some unfinished 248 # business 249 raise CorruptedXMLError("Premature end of XML stream") 250 try: 251 for record in self.object: 252 yield record 253 except AttributeError: 254 if self.parser.StartElementHandler: 255 # We saw the initial <!xml declaration, and expat 256 # didn't notice any errors, so self.object should be 257 # defined. If not, this is a bug. 258 raise RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at biopython-dev@biopython.org for assistance.") 259 else: 260 # We did not see the initial <!xml declaration, so 261 # probably the input data is not in XML format. 262 raise NotXMLError("XML declaration not found") 263 self.parser.Parse("", True) 264 self.parser = None 265 return 266 267 try: 268 self.parser.Parse(text, False) 269 except expat.ExpatError as e: 270 if self.parser.StartElementHandler: 271 # We saw the initial <!xml declaration, so we can be sure 272 # that we are parsing XML data. Most likely, the XML file 273 # is corrupted. 274 raise CorruptedXMLError(e) 275 else: 276 # We have not seen the initial <!xml declaration, so 277 # probably the input data is not in XML format. 278 raise NotXMLError(e) 279 280 if not self.stack: 281 # Haven't read enough from the XML file yet 282 continue 283 284 records = self.stack[0] 285 if not isinstance(records, list): 286 raise ValueError("The XML file does not represent a list. Please use Entrez.read instead of Entrez.parse") 287 while len(records) > 1: # Then the top record is finished 288 record = records.pop(0) 289 yield record
290
291 - def xmlDeclHandler(self, version, encoding, standalone):
292 # XML declaration found; set the handlers 293 self.parser.StartElementHandler = self.startElementHandler 294 self.parser.EndElementHandler = self.endElementHandler 295 self.parser.CharacterDataHandler = self.characterDataHandler 296 self.parser.ExternalEntityRefHandler = self.externalEntityRefHandler 297 self.parser.StartNamespaceDeclHandler = self.startNamespaceDeclHandler
298
299 - def startNamespaceDeclHandler(self, prefix, un):
300 # This is an xml schema 301 if "Schema" in un: 302 self.is_schema = True 303 else: 304 raise NotImplementedError("The Bio.Entrez parser cannot handle XML data that make use of XML namespaces")
305
306 - def startElementHandler(self, name, attrs):
307 # preprocessing the xml schema 308 if self.is_schema: 309 if len(attrs) == 1: 310 schema = list(attrs.values())[0] 311 handle = self.open_xsd_file(os.path.basename(schema)) 312 # if there is no local xsd file grab the url and parse the file 313 if not handle: 314 handle = _urlopen(schema) 315 text = handle.read() 316 self.save_xsd_file(os.path.basename(schema), text) 317 handle.close() 318 self.parse_xsd(ET.fromstring(text)) 319 else: 320 self.parse_xsd(ET.fromstring(handle.read())) 321 handle.close() 322 self.content = "" 323 if name in self.lists: 324 object = ListElement() 325 elif name in self.dictionaries: 326 object = DictionaryElement() 327 elif name in self.structures: 328 object = StructureElement(self.structures[name]) 329 elif name in self.items: # Only appears in ESummary 330 name = str(attrs["Name"]) # convert from Unicode 331 del attrs["Name"] 332 itemtype = str(attrs["Type"]) # convert from Unicode 333 del attrs["Type"] 334 if itemtype == "Structure": 335 object = DictionaryElement() 336 elif name in ("ArticleIds", "History"): 337 object = StructureElement(["pubmed", "medline"]) 338 elif itemtype == "List": 339 object = ListElement() 340 else: 341 object = StringElement() 342 object.itemname = name 343 object.itemtype = itemtype 344 elif name in self.strings + self.errors + self.integers: 345 self.attributes = attrs 346 return 347 else: 348 # Element not found in DTD 349 if self.validating: 350 raise ValidationError(name) 351 else: 352 # this will not be stored in the record 353 object = "" 354 if object != "": 355 object.tag = name 356 if attrs: 357 object.attributes = dict(attrs) 358 if len(self.stack) != 0: 359 current = self.stack[-1] 360 try: 361 current.append(object) 362 except AttributeError: 363 current[name] = object 364 self.stack.append(object)
365
366 - def endElementHandler(self, name):
367 value = self.content 368 if name in self.errors: 369 if value == "": 370 return 371 else: 372 raise RuntimeError(value) 373 elif name in self.integers: 374 value = IntegerElement(value) 375 elif name in self.strings: 376 # Convert Unicode strings to plain strings if possible 377 try: 378 value = StringElement(value) 379 except UnicodeEncodeError: 380 value = UnicodeElement(value) 381 elif name in self.items: 382 self.object = self.stack.pop() 383 if self.object.itemtype in ("List", "Structure"): 384 return 385 elif self.object.itemtype == "Integer" and value: 386 value = IntegerElement(value) 387 else: 388 # Convert Unicode strings to plain strings if possible 389 try: 390 value = StringElement(value) 391 except UnicodeEncodeError: 392 value = UnicodeElement(value) 393 name = self.object.itemname 394 else: 395 self.object = self.stack.pop() 396 value = re.sub(r"[\s]+", "", value) 397 if self.is_schema and value: 398 self.object.update({'data': value}) 399 return 400 value.tag = name 401 if self.attributes: 402 value.attributes = dict(self.attributes) 403 del self.attributes 404 current = self.stack[-1] 405 if current != "": 406 try: 407 current.append(value) 408 except AttributeError: 409 current[name] = value
410
411 - def characterDataHandler(self, content):
412 self.content += content
413
414 - def parse_xsd(self, root):
415 is_dictionary = False 416 name = "" 417 for child in root: 418 for element in child.getiterator(): 419 if "element" in element.tag: 420 if "name" in element.attrib: 421 name = element.attrib['name'] 422 if "attribute" in element.tag: 423 is_dictionary = True 424 if is_dictionary: 425 self.dictionaries.append(name) 426 is_dictionary = False 427 else: 428 self.lists.append(name)
429
430 - def elementDecl(self, name, model):
431 """This callback function is called for each element declaration: 432 <!ELEMENT name (...)> 433 encountered in a DTD. The purpose of this function is to determine 434 whether this element should be regarded as a string, integer, list 435 dictionary, structure, or error.""" 436 if name.upper() == "ERROR": 437 self.errors.append(name) 438 return 439 if name == 'Item' and model == (expat.model.XML_CTYPE_MIXED, 440 expat.model.XML_CQUANT_REP, 441 None, ((expat.model.XML_CTYPE_NAME, 442 expat.model.XML_CQUANT_NONE, 443 'Item', 444 () 445 ), 446 ) 447 ): 448 # Special case. As far as I can tell, this only occurs in the 449 # eSummary DTD. 450 self.items.append(name) 451 return 452 # First, remove ignorable parentheses around declarations 453 while (model[0] in (expat.model.XML_CTYPE_SEQ, 454 expat.model.XML_CTYPE_CHOICE) 455 and model[1] in (expat.model.XML_CQUANT_NONE, 456 expat.model.XML_CQUANT_OPT) 457 and len(model[3]) == 1): 458 model = model[3][0] 459 # PCDATA declarations correspond to strings 460 if model[0] in (expat.model.XML_CTYPE_MIXED, 461 expat.model.XML_CTYPE_EMPTY): 462 self.strings.append(name) 463 return 464 # List-type elements 465 if (model[0] in (expat.model.XML_CTYPE_CHOICE, 466 expat.model.XML_CTYPE_SEQ) and 467 model[1] in (expat.model.XML_CQUANT_PLUS, 468 expat.model.XML_CQUANT_REP)): 469 self.lists.append(name) 470 return 471 # This is the tricky case. Check which keys can occur multiple 472 # times. If only one key is possible, and it can occur multiple 473 # times, then this is a list. If more than one key is possible, 474 # but none of them can occur multiple times, then this is a 475 # dictionary. Otherwise, this is a structure. 476 # In 'single' and 'multiple', we keep track which keys can occur 477 # only once, and which can occur multiple times. 478 single = [] 479 multiple = [] 480 # The 'count' function is called recursively to make sure all the 481 # children in this model are counted. 
Error keys are ignored; 482 # they raise an exception in Python. 483 484 def count(model): 485 quantifier, name, children = model[1:] 486 if name is None: 487 if quantifier in (expat.model.XML_CQUANT_PLUS, 488 expat.model.XML_CQUANT_REP): 489 for child in children: 490 multiple.append(child[2]) 491 else: 492 for child in children: 493 count(child) 494 elif name.upper() != "ERROR": 495 if quantifier in (expat.model.XML_CQUANT_NONE, 496 expat.model.XML_CQUANT_OPT): 497 single.append(name) 498 elif quantifier in (expat.model.XML_CQUANT_PLUS, 499 expat.model.XML_CQUANT_REP): 500 multiple.append(name)
501 count(model) 502 if len(single) == 0 and len(multiple) == 1: 503 self.lists.append(name) 504 elif len(multiple) == 0: 505 self.dictionaries.append(name) 506 else: 507 self.structures.update({name: multiple})
508
509 - def open_dtd_file(self, filename):
510 path = os.path.join(DataHandler.local_dtd_dir, filename) 511 try: 512 handle = open(path, "rb") 513 except IOError: 514 pass 515 else: 516 return handle 517 path = os.path.join(DataHandler.global_dtd_dir, filename) 518 try: 519 handle = open(path, "rb") 520 except IOError: 521 pass 522 else: 523 return handle 524 return None
525
526 - def open_xsd_file(self, filename):
527 path = os.path.join(DataHandler.local_xsd_dir, filename) 528 try: 529 handle = open(path, "rb") 530 except IOError: 531 pass 532 else: 533 return handle 534 path = os.path.join(DataHandler.global_xsd_dir, filename) 535 try: 536 handle = open(path, "rb") 537 except IOError: 538 pass 539 else: 540 return handle 541 return None
542
543 - def save_dtd_file(self, filename, text):
544 path = os.path.join(DataHandler.local_dtd_dir, filename) 545 try: 546 handle = open(path, "wb") 547 except IOError: 548 warnings.warn("Failed to save %s at %s" % (filename, path)) 549 else: 550 handle.write(text) 551 handle.close()
552
553 - def save_xsd_file(self, filename, text):
554 path = os.path.join(DataHandler.local_xsd_dir, filename) 555 try: 556 handle = open(path, "wb") 557 except IOError: 558 warnings.warn("Failed to save %s at %s" % (filename, path)) 559 else: 560 handle.write(text) 561 handle.close()
562
563 - def externalEntityRefHandler(self, context, base, systemId, publicId):
564 """The purpose of this function is to load the DTD locally, instead 565 of downloading it from the URL specified in the XML. Using the local 566 DTD results in much faster parsing. If the DTD is not found locally, 567 we try to download it. If new DTDs become available from NCBI, 568 putting them in Bio/Entrez/DTDs will allow the parser to see them.""" 569 urlinfo = _urlparse(systemId) 570 # Following attribute requires Python 2.5+ 571 # if urlinfo.scheme=='http': 572 if urlinfo[0] in ['http', 'https', 'ftp']: 573 # Then this is an absolute path to the DTD. 574 url = systemId 575 elif urlinfo[0] == '': 576 # Then this is a relative path to the DTD. 577 # Look at the parent URL to find the full path. 578 try: 579 source = self.dtd_urls[-1] 580 except IndexError: 581 # Assume the default URL for DTDs if the top parent 582 # does not contain an absolute path 583 source = "http://www.ncbi.nlm.nih.gov/dtd/" 584 else: 585 source = os.path.dirname(source) 586 # urls always have a forward slash, don't use os.path.join 587 url = source.rstrip("/") + "/" + systemId 588 else: 589 raise ValueError("Unexpected URL scheme %r" % (urlinfo[0])) 590 self.dtd_urls.append(url) 591 # First, try to load the local version of the DTD file 592 location, filename = os.path.split(systemId) 593 handle = self.open_dtd_file(filename) 594 if not handle: 595 # DTD is not available as a local file. Try accessing it through 596 # the internet instead. 597 try: 598 handle = _urlopen(url) 599 except IOError: 600 raise RuntimeError("Failed to access %s at %s" % (filename, url)) 601 text = handle.read() 602 handle.close() 603 self.save_dtd_file(filename, text) 604 handle = BytesIO(text) 605 606 parser = self.parser.ExternalEntityParserCreate(context) 607 parser.ElementDeclHandler = self.elementDecl 608 parser.ParseFile(handle) 609 handle.close() 610 self.dtd_urls.pop() 611 return 1
612