Package Bio :: Package Entrez :: Module Parser
[hide private]
[frames] | no frames]

Source Code for Module Bio.Entrez.Parser

  1  # Copyright 2008-2014 by Michiel de Hoon.  All rights reserved. 
  2  # Revisions copyright 2008-2015 by Peter Cock. All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Parser for XML results returned by NCBI's Entrez Utilities. 
  8   
  9  This parser is used by the read() function in Bio.Entrez, and is not 
 10  intended to be used directly. 
 11   
 12  The question is how to represent an XML file as Python objects. Some 
 13  XML files returned by NCBI look like lists, others look like dictionaries, 
 14  and others look like a mix of lists and dictionaries. 
 15   
 16  My approach is to classify each possible element in the XML as a plain 
 17  string, an integer, a list, a dictionary, or a structure. The latter is a 
 18  dictionary where the same key can occur multiple times; in Python, it is 
 19  represented as a dictionary where that key occurs once, pointing to a list 
 20  of values found in the XML file. 
 21   
 22  The parser then goes through the XML and creates the appropriate Python 
 23  object for each element. The different levels encountered in the XML are 
 24  preserved on the Python side. So a subelement of a subelement of an element 
 25  is a value in a dictionary that is stored in a list which is a value in 
 26  some other dictionary (or a value in a list which itself belongs to a list 
 27  which is a value in a dictionary, and so on). Attributes encountered in 
 28  the XML are stored as a dictionary in a member .attributes of each element, 
 29  and the tag name is saved in a member .tag. 
 30   
 31  To decide which kind of Python object corresponds to each element in the 
 32  XML, the parser analyzes the DTD referred to at the top of (almost) every 
 33  XML file returned by the Entrez Utilities. This is preferred over a hand- 
 34  written solution, since the number of DTDs is rather large and their 
 35  contents may change over time. About half the code in this parser deals 
 36  with parsing the DTD, and the other half with the XML itself. 
 37  """ 
 38  import sys 
 39  import re 
 40  import os 
 41  import warnings 
 42  from xml.parsers import expat 
 43  from io import BytesIO 
 44  import xml.etree.ElementTree as ET 
 45   
 46  # Importing these functions with leading underscore as not intended for reuse 
 47  from Bio._py3k import urlopen as _urlopen 
 48  from Bio._py3k import urlparse as _urlparse 
 49  from Bio._py3k import unicode 
 50   
 51   
 52  # The following five classes are used to add a member .attributes to integers, 
 53  # strings, unicode strings, lists, and dictionaries, respectively. 
 54   
 55   
class IntegerElement(int):
    """Integer value that may carry an ``attributes`` member."""

    def __repr__(self):
        """Return the plain int repr, or a tagged form when attributes are set."""
        plain = int.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached; look like an ordinary int.
            return plain
        return "IntegerElement(%s, attributes=%s)" % (plain, repr(self.attributes))
64 65
class StringElement(str):
    """String value that may carry an ``attributes`` member."""

    def __repr__(self):
        """Return the plain str repr, or a tagged form when attributes are set."""
        plain = str.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached; look like an ordinary string.
            return plain
        return "StringElement(%s, attributes=%s)" % (plain, repr(self.attributes))
74 75
class UnicodeElement(unicode):
    """Unicode string value that may carry an ``attributes`` member."""

    def __repr__(self):
        """Return the plain unicode repr, or a tagged form when attributes are set."""
        plain = unicode.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached; look like an ordinary unicode string.
            return plain
        return "UnicodeElement(%s, attributes=%s)" % (plain, repr(self.attributes))
84 85
class ListElement(list):
    """List that may carry an ``attributes`` member."""

    def __repr__(self):
        """Return the plain list repr, or a tagged form when attributes are set."""
        plain = list.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached; look like an ordinary list.
            return plain
        return "ListElement(%s, attributes=%s)" % (plain, repr(self.attributes))
94 95
class DictionaryElement(dict):
    """Dictionary that may carry an ``attributes`` member."""

    def __repr__(self):
        """Return the plain dict repr, or a tagged form when attributes are set."""
        plain = dict.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached; look like an ordinary dict.
            return plain
        return "DictElement(%s, attributes=%s)" % (plain, repr(self.attributes))
104  
105  
106  # A StructureElement is like a dictionary, but some of its keys can have 
107  # multiple values associated with them. These values are stored in a list 
108  # under each key. 
class StructureElement(dict):
    """Dictionary in which selected keys collect their values in a list.

    Keys passed to the constructor start out mapped to an empty list, and
    every later assignment to such a key appends to that list. Assignments
    to any other key behave like a normal dictionary assignment.
    """

    def __init__(self, keys):
        """Create the structure; ``keys`` are the keys that hold lists."""
        dict.__init__(self)
        self.listkeys = keys
        for list_key in keys:
            # Bypass our own __setitem__, which would try to append.
            dict.__setitem__(self, list_key, [])

    def __setitem__(self, key, value):
        """Append ``value`` for list keys; store it directly otherwise."""
        if key not in self.listkeys:
            dict.__setitem__(self, key, value)
            return
        self[key].append(value)

    def __repr__(self):
        """Return the plain dict repr, or a tagged form when attributes are set."""
        plain = dict.__repr__(self)
        if not hasattr(self, "attributes"):
            return plain
        return "DictElement(%s, attributes=%s)" % (plain, repr(self.attributes))
129 130
class NotXMLError(ValueError):
    """Raised when the input data do not appear to be XML at all."""

    def __init__(self, message):
        """Store ``message`` (the underlying reason) on ``self.msg``."""
        # Initialise the base class so that ``args`` is populated; without
        # this the exception cannot be copied or pickled, because it would
        # be re-created with no arguments.
        ValueError.__init__(self, message)
        self.msg = message

    def __str__(self):
        """Return the user-facing explanation including the stored reason."""
        return "Failed to parse the XML data (%s). Please make sure that the input data are in XML format." % self.msg
137 138
class CorruptedXMLError(ValueError):
    """Raised when XML input was recognised but could not be parsed fully."""

    def __init__(self, message):
        """Store ``message`` (the underlying reason) on ``self.msg``."""
        # Initialise the base class so that ``args`` is populated; without
        # this the exception cannot be copied or pickled, because it would
        # be re-created with no arguments.
        ValueError.__init__(self, message)
        self.msg = message

    def __str__(self):
        """Return the user-facing explanation including the stored reason."""
        return "Failed to parse the XML data (%s). Please make sure that the input data are not corrupted." % self.msg
145 146
class ValidationError(ValueError):
    """Raised by validating parsers for a tag that is not defined in the DTD.

    Validating parsers raise this error if the parser finds a tag in the XML
    that is not defined in the DTD. Non-validating parsers do not raise this
    error. The Bio.Entrez.read and Bio.Entrez.parse functions use validating
    parsers by default (see those functions for more information).
    """

    def __init__(self, name):
        """Store the offending tag ``name`` on ``self.name``."""
        # Initialise the base class so that ``args`` is populated; without
        # this the exception cannot be copied or pickled, because it would
        # be re-created with no arguments.
        ValueError.__init__(self, name)
        self.name = name

    def __str__(self):
        """Return the user-facing explanation naming the unknown tag."""
        return "Failed to find tag '%s' in the DTD. To skip all tags that are not represented in the DTD, please call Bio.Entrez.read or Bio.Entrez.parse with validate=False." % self.name
154 155
class DataHandler(object):
    """Expat-driven handler that converts Entrez XML into Python objects.

    The handler classifies every element declared in the DTD (or XML schema)
    referenced by the document as a string, integer, list, dictionary,
    structure, item or error, and then builds the matching element objects
    while the XML itself is parsed.

    Class attributes:
     - local_dtd_dir, local_xsd_dir: per-user directories in which
       downloaded DTD / XSD files are saved; created when the class is defined.
     - global_dtd_dir, global_xsd_dir: the DTDs / XSDs directories inside the
       installed Bio.Entrez package.
    """

    import platform
    if platform.system() == 'Windows':
        directory = os.path.join(os.getenv("APPDATA"), "biopython")
    else:  # Unix/Linux/Mac
        home = os.path.expanduser('~')
        directory = os.path.join(home, '.config', 'biopython')
        del home
    local_dtd_dir = os.path.join(directory, 'Bio', 'Entrez', 'DTDs')
    local_xsd_dir = os.path.join(directory, 'Bio', 'Entrez', 'XSDs')
    del directory
    del platform
    try:
        os.makedirs(local_dtd_dir)  # use exist_ok=True on Python >= 3.2
    except OSError:
        # Check if local_dtd_dir already exists, and that it is a directory.
        # Trying os.makedirs first and then checking for os.path.isdir avoids
        # a race condition. A bare raise keeps the original traceback and
        # does not leave an exception name behind in the class namespace.
        if not os.path.isdir(local_dtd_dir):
            raise
    try:
        os.makedirs(local_xsd_dir)  # use exist_ok=True on Python >= 3.2
    except OSError:
        if not os.path.isdir(local_xsd_dir):
            raise

    from Bio import Entrez
    global_dtd_dir = os.path.join(str(Entrez.__path__[0]), "DTDs")
    global_xsd_dir = os.path.join(str(Entrez.__path__[0]), "XSDs")
    del Entrez

    def __init__(self, validate):
        """Create the expat parser and the empty element classification.

        If ``validate`` is true, tags missing from the DTD raise
        ValidationError; otherwise such tags are skipped.
        """
        self.stack = []
        self.errors = []
        self.integers = []
        self.strings = []
        self.lists = []
        self.dictionaries = []
        self.structures = {}
        self.items = []
        self.dtd_urls = []
        self.validating = validate
        # Attributes of the string/integer/error element currently open, and
        # the character data collected since the last start tag. Defining
        # them here means endElementHandler never hits a missing attribute.
        self.attributes = None
        self.content = ""
        self.parser = expat.ParserCreate(namespace_separator=" ")
        self.parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
        # The remaining handlers are installed once the XML declaration is
        # seen; see xmlDeclHandler.
        self.parser.XmlDeclHandler = self.xmlDeclHandler
        self.is_schema = False

    def read(self, handle):
        """Set up the parser and let it parse the XML results.

        Returns the object built for the outermost element. Raises IOError
        for a closed handle, CorruptedXMLError or NotXMLError when expat
        fails, and RuntimeError if parsing succeeded without producing a
        result.
        """
        # HACK: remove Bio._py3k handle conversion, since the Entrez XML parser
        # expects binary data
        if handle.__class__.__name__ == 'EvilHandleHack':
            handle = handle._handle
        if handle.__class__.__name__ == 'TextIOWrapper':
            handle = handle.buffer
        if hasattr(handle, "closed") and handle.closed:
            # Should avoid a possible Segmentation Fault, see:
            # http://bugs.python.org/issue4877
            raise IOError("Can't parse a closed handle")
        if sys.version_info[0] >= 3:
            # Another nasty hack to cope with a unicode StringIO handle
            # since the Entrez XML parser expects binary data (bytes)
            from io import StringIO
            if isinstance(handle, StringIO):
                from Bio._py3k import _as_bytes
                handle = BytesIO(_as_bytes(handle.read()))
        try:
            self.parser.ParseFile(handle)
        except expat.ExpatError as e:
            if self.parser.StartElementHandler:
                # We saw the initial <!xml declaration, so we can be sure that
                # we are parsing XML data. Most likely, the XML file is
                # corrupted.
                raise CorruptedXMLError(e)
            else:
                # We have not seen the initial <!xml declaration, so probably
                # the input data is not in XML format.
                raise NotXMLError(e)
        try:
            return self.object
        except AttributeError:
            if self.parser.StartElementHandler:
                # We saw the initial <!xml declaration, and expat didn't notice
                # any errors, so self.object should be defined. If not, this is
                # a bug.
                raise RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at biopython-dev@biopython.org for assistance.")
            else:
                # We did not see the initial <!xml declaration, so probably
                # the input data is not in XML format.
                raise NotXMLError("XML declaration not found")

    def parse(self, handle):
        """Parse the XML in blocks and yield each finished top-level record.

        The outermost element must be a list; otherwise ValueError is raised.
        Raises CorruptedXMLError or NotXMLError on malformed input.
        """
        BLOCK = 1024
        while True:
            # Read in another block of the file...
            text = handle.read(BLOCK)
            if not text:
                # We have reached the end of the XML file
                if self.stack:
                    # No more XML data, but there is still some unfinished
                    # business
                    raise CorruptedXMLError("Premature end of XML stream")
                # Only the attribute lookup is guarded, so an unrelated
                # AttributeError raised while the records are being consumed
                # is not mistaken for a parsing failure.
                try:
                    remaining = self.object
                except AttributeError:
                    if self.parser.StartElementHandler:
                        # We saw the initial <!xml declaration, and expat
                        # didn't notice any errors, so self.object should be
                        # defined. If not, this is a bug.
                        raise RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at biopython-dev@biopython.org for assistance.")
                    else:
                        # We did not see the initial <!xml declaration, so
                        # probably the input data is not in XML format.
                        raise NotXMLError("XML declaration not found")
                for record in remaining:
                    yield record
                self.parser.Parse("", True)
                self.parser = None
                return

            try:
                self.parser.Parse(text, False)
            except expat.ExpatError as e:
                if self.parser.StartElementHandler:
                    # We saw the initial <!xml declaration, so we can be sure
                    # that we are parsing XML data. Most likely, the XML file
                    # is corrupted.
                    raise CorruptedXMLError(e)
                else:
                    # We have not seen the initial <!xml declaration, so
                    # probably the input data is not in XML format.
                    raise NotXMLError(e)

            if not self.stack:
                # Haven't read enough from the XML file yet
                continue

            records = self.stack[0]
            if not isinstance(records, list):
                raise ValueError("The XML file does not represent a list. Please use Entrez.read instead of Entrez.parse")
            while len(records) > 1:  # Then the top record is finished
                record = records.pop(0)
                yield record

    def xmlDeclHandler(self, version, encoding, standalone):
        """Install the remaining expat handlers once the XML declaration is seen."""
        # XML declaration found; set the handlers
        self.parser.StartElementHandler = self.startElementHandler
        self.parser.EndElementHandler = self.endElementHandler
        self.parser.CharacterDataHandler = self.characterDataHandler
        self.parser.ExternalEntityRefHandler = self.externalEntityRefHandler
        self.parser.StartNamespaceDeclHandler = self.startNamespaceDeclHandler

    def startNamespaceDeclHandler(self, prefix, un):
        """Flag schema-based documents; reject any other namespace use."""
        # This is an xml schema
        if "Schema" in un:
            self.is_schema = True
        else:
            raise NotImplementedError("The Bio.Entrez parser cannot handle XML data that make use of XML namespaces")

    def startElementHandler(self, name, attrs):
        """Create the container for element ``name`` and push it on the stack.

        String, integer and error elements create no container; only their
        attributes are remembered until the end tag. Raises ValidationError
        for an unknown tag when validating.
        """
        # preprocessing the xml schema
        if self.is_schema and len(attrs) == 1:
            schema = list(attrs.values())[0]
            xsd_name = os.path.basename(schema)
            handle = self.open_xsd_file(xsd_name)
            if handle:
                try:
                    text = handle.read()
                finally:
                    handle.close()
            else:
                # if there is no local xsd file grab the url and parse the file
                handle = _urlopen(schema)
                try:
                    text = handle.read()
                finally:
                    handle.close()
                self.save_xsd_file(xsd_name, text)
            self.parse_xsd(ET.fromstring(text))
        self.content = ""
        if name in self.lists:
            element = ListElement()
        elif name in self.dictionaries:
            element = DictionaryElement()
        elif name in self.structures:
            element = StructureElement(self.structures[name])
        elif name in self.items:  # Only appears in ESummary
            name = str(attrs["Name"])  # convert from Unicode
            del attrs["Name"]
            itemtype = str(attrs["Type"])  # convert from Unicode
            del attrs["Type"]
            if itemtype == "Structure":
                element = DictionaryElement()
            elif name in ("ArticleIds", "History"):
                element = StructureElement(["pubmed", "medline"])
            elif itemtype == "List":
                element = ListElement()
            else:
                element = StringElement()
            element.itemname = name
            element.itemtype = itemtype
        elif (name in self.strings or name in self.errors or
              name in self.integers):
            # Checked list by list instead of concatenating the three lists
            # for every element.
            self.attributes = attrs
            return
        else:
            # Element not found in DTD
            if self.validating:
                raise ValidationError(name)
            else:
                # this will not be stored in the record
                element = ""
        if element != "":
            element.tag = name
            if attrs:
                element.attributes = dict(attrs)
            if self.stack:
                current = self.stack[-1]
                # A parent of "" is a skipped element; its children are not
                # stored either (endElementHandler applies the same rule).
                if current != "":
                    try:
                        current.append(element)
                    except AttributeError:
                        current[name] = element
        self.stack.append(element)

    def endElementHandler(self, name):
        """Finish element ``name`` and store its value in the parent container.

        Raises RuntimeError with the element text for a non-empty error
        element.
        """
        value = self.content
        if name in self.errors:
            if value == "":
                return
            else:
                raise RuntimeError(value)
        elif name in self.integers:
            value = IntegerElement(value)
        elif name in self.strings:
            # Convert Unicode strings to plain strings if possible
            try:
                value = StringElement(value)
            except UnicodeEncodeError:
                value = UnicodeElement(value)
        elif name in self.items:
            self.object = self.stack.pop()
            if self.object.itemtype in ("List", "Structure"):
                return
            elif self.object.itemtype == "Integer" and value:
                value = IntegerElement(value)
            else:
                # Convert Unicode strings to plain strings if possible
                try:
                    value = StringElement(value)
                except UnicodeEncodeError:
                    value = UnicodeElement(value)
            name = self.object.itemname
        else:
            self.object = self.stack.pop()
            value = re.sub(r"[\s]+", "", value)
            if self.is_schema and value:
                self.object.update({'data': value})
            return
        value.tag = name
        if self.attributes:
            value.attributes = dict(self.attributes)
            # Reset rather than delete, so that a later end tag that did not
            # set attributes (an Item, for instance) finds a defined member.
            self.attributes = None
        current = self.stack[-1]
        if current != "":
            try:
                current.append(value)
            except AttributeError:
                current[name] = value

    def characterDataHandler(self, content):
        """Accumulate character data for the element currently being read."""
        self.content += content

    def parse_xsd(self, root):
        """Classify the elements declared under ``root`` of an XML schema.

        For each child of ``root``, the last name found on a tag containing
        "element" is recorded as a dictionary if any tag containing
        "attribute" occurs beneath that child, and as a list otherwise.
        """
        is_dictionary = False
        name = ""
        for child in root:
            # Element.getiterator() was removed in Python 3.9; iter() is the
            # equivalent traversal.
            for element in child.iter():
                if "element" in element.tag:
                    if "name" in element.attrib:
                        name = element.attrib['name']
                if "attribute" in element.tag:
                    is_dictionary = True
            if is_dictionary:
                self.dictionaries.append(name)
                is_dictionary = False
            else:
                self.lists.append(name)

    def elementDecl(self, name, model):
        """Classify an element declared in a DTD.

        This callback function is called for each element declaration:
        <!ELEMENT       name          (...)>
        encountered in a DTD. The purpose of this function is to determine
        whether this element should be regarded as a string, integer, list
        dictionary, structure, or error.
        """
        if name.upper() == "ERROR":
            self.errors.append(name)
            return
        if name == 'Item' and model == (expat.model.XML_CTYPE_MIXED,
                                        expat.model.XML_CQUANT_REP,
                                        None, ((expat.model.XML_CTYPE_NAME,
                                                expat.model.XML_CQUANT_NONE,
                                                'Item',
                                                ()
                                                ),
                                               )
                                        ):
            # Special case. As far as I can tell, this only occurs in the
            # eSummary DTD.
            self.items.append(name)
            return
        # First, remove ignorable parentheses around declarations
        while (model[0] in (expat.model.XML_CTYPE_SEQ,
                            expat.model.XML_CTYPE_CHOICE) and
               model[1] in (expat.model.XML_CQUANT_NONE,
                            expat.model.XML_CQUANT_OPT) and
               len(model[3]) == 1):
            model = model[3][0]
        # PCDATA declarations correspond to strings
        if model[0] in (expat.model.XML_CTYPE_MIXED,
                        expat.model.XML_CTYPE_EMPTY):
            self.strings.append(name)
            return
        # List-type elements
        if (model[0] in (expat.model.XML_CTYPE_CHOICE,
                         expat.model.XML_CTYPE_SEQ) and
            model[1] in (expat.model.XML_CQUANT_PLUS,
                         expat.model.XML_CQUANT_REP)):
            self.lists.append(name)
            return
        # This is the tricky case. Check which keys can occur multiple
        # times. If only one key is possible, and it can occur multiple
        # times, then this is a list. If more than one key is possible,
        # but none of them can occur multiple times, then this is a
        # dictionary. Otherwise, this is a structure.
        # In 'single' and 'multiple', we keep track which keys can occur
        # only once, and which can occur multiple times.
        single = []
        multiple = []

        # The 'count' function is called recursively to make sure all the
        # children in this model are counted. Error keys are ignored;
        # they raise an exception in Python.
        def count(submodel):
            quantifier, child_name, children = submodel[1:]
            if child_name is None:
                if quantifier in (expat.model.XML_CQUANT_PLUS,
                                  expat.model.XML_CQUANT_REP):
                    for child in children:
                        multiple.append(child[2])
                else:
                    for child in children:
                        count(child)
            elif child_name.upper() != "ERROR":
                if quantifier in (expat.model.XML_CQUANT_NONE,
                                  expat.model.XML_CQUANT_OPT):
                    single.append(child_name)
                elif quantifier in (expat.model.XML_CQUANT_PLUS,
                                    expat.model.XML_CQUANT_REP):
                    multiple.append(child_name)

        count(model)
        if len(single) == 0 and len(multiple) == 1:
            self.lists.append(name)
        elif len(multiple) == 0:
            self.dictionaries.append(name)
        else:
            self.structures.update({name: multiple})

    def _open_first(self, directories, filename):
        """Open ``filename`` for binary reading from the first directory that has it.

        Returns the open handle, or None if it cannot be opened in any of
        ``directories``.
        """
        for directory in directories:
            try:
                return open(os.path.join(directory, filename), "rb")
            except IOError:
                continue
        return None

    def _save_local(self, directory, filename, text):
        """Write ``text`` to ``filename`` in ``directory``; warn if it cannot be opened."""
        path = os.path.join(directory, filename)
        try:
            handle = open(path, "wb")
        except IOError:
            warnings.warn("Failed to save %s at %s" % (filename, path))
        else:
            try:
                handle.write(text)
            finally:
                handle.close()

    def open_dtd_file(self, filename):
        """Open a DTD from the local, then the global, DTD directory (or return None)."""
        return self._open_first((DataHandler.local_dtd_dir,
                                 DataHandler.global_dtd_dir), filename)

    def open_xsd_file(self, filename):
        """Open an XSD from the local, then the global, XSD directory (or return None)."""
        return self._open_first((DataHandler.local_xsd_dir,
                                 DataHandler.global_xsd_dir), filename)

    def save_dtd_file(self, filename, text):
        """Save the DTD contents ``text`` in the local DTD directory."""
        self._save_local(DataHandler.local_dtd_dir, filename, text)

    def save_xsd_file(self, filename, text):
        """Save the XSD contents ``text`` in the local XSD directory."""
        self._save_local(DataHandler.local_xsd_dir, filename, text)

    def externalEntityRefHandler(self, context, base, systemId, publicId):
        """Load and parse the DTD referenced by the XML document.

        The purpose of this function is to load the DTD locally, instead
        of downloading it from the URL specified in the XML. Using the local
        DTD results in much faster parsing. If the DTD is not found locally,
        we try to download it. If new DTDs become available from NCBI,
        putting them in Bio/Entrez/DTDs will allow the parser to see them.

        Returns 1. Raises ValueError for an unexpected URL scheme and
        RuntimeError if the DTD cannot be downloaded.
        """
        urlinfo = _urlparse(systemId)
        # Following attribute requires Python 2.5+
        # if urlinfo.scheme=='http':
        if urlinfo[0] in ['http', 'https', 'ftp']:
            # Then this is an absolute path to the DTD.
            url = systemId
        elif urlinfo[0] == '':
            # Then this is a relative path to the DTD.
            # Look at the parent URL to find the full path.
            try:
                source = self.dtd_urls[-1]
            except IndexError:
                # Assume the default URL for DTDs if the top parent
                # does not contain an absolute path
                source = "http://www.ncbi.nlm.nih.gov/dtd/"
            else:
                source = os.path.dirname(source)
            # urls always have a forward slash, don't use os.path.join
            url = source.rstrip("/") + "/" + systemId
        else:
            raise ValueError("Unexpected URL scheme %r" % (urlinfo[0]))
        self.dtd_urls.append(url)
        # First, try to load the local version of the DTD file
        filename = os.path.basename(systemId)
        handle = self.open_dtd_file(filename)
        if not handle:
            # DTD is not available as a local file. Try accessing it through
            # the internet instead.
            try:
                handle = _urlopen(url)
            except IOError:
                raise RuntimeError("Failed to access %s at %s" % (filename, url))
            try:
                text = handle.read()
            finally:
                handle.close()
            self.save_dtd_file(filename, text)
            handle = BytesIO(text)

        parser = self.parser.ExternalEntityParserCreate(context)
        parser.ElementDeclHandler = self.elementDecl
        try:
            parser.ParseFile(handle)
        finally:
            # Close the DTD handle even if the DTD fails to parse.
            handle.close()
        self.dtd_urls.pop()
        return 1
622