Package Bio :: Package Entrez :: Module Parser
[hide private]
[frames] | no frames]

Source Code for Module Bio.Entrez.Parser

  1  # Copyright 2008 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Parser for XML results returned by NCBI's Entrez Utilities. 
  7   
  8  This parser is used by the read() function in Bio.Entrez, and is not 
  9  intended to be used directly. 
 10   
 11  The question is how to represent an XML file as Python objects. Some 
 12  XML files returned by NCBI look like lists, others look like dictionaries, 
 13  and others look like a mix of lists and dictionaries. 
 14   
 15  My approach is to classify each possible element in the XML as a plain 
 16  string, an integer, a list, a dictionary, or a structure. The latter is a 
 17  dictionary where the same key can occur multiple times; in Python, it is 
 18  represented as a dictionary where that key occurs once, pointing to a list 
 19  of values found in the XML file. 
 20   
 21  The parser then goes through the XML and creates the appropriate Python 
 22  object for each element. The different levels encountered in the XML are 
 23  preserved on the Python side. So a subelement of a subelement of an element 
 24  is a value in a dictionary that is stored in a list which is a value in 
 25  some other dictionary (or a value in a list which itself belongs to a list 
 26  which is a value in a dictionary, and so on). Attributes encountered in 
 27  the XML are stored as a dictionary in a member .attributes of each element, 
 28  and the tag name is saved in a member .tag. 
 29   
 30  To decide which kind of Python object corresponds to each element in the 
 31  XML, the parser analyzes the DTD referred at the top of (almost) every 
 32  XML file returned by the Entrez Utilities. This is preferred over a hand- 
 33  written solution, since the number of DTDs is rather large and their 
 34  contents may change over time. About half the code in this parser deals 
 35  with parsing the DTD, and the other half with the XML itself. 
 36  """ 
 37   
 38  import os 
 39  import warnings 
 40  from xml.parsers import expat 
 41  from io import BytesIO 
 42   
 43  # Importing these functions with leading underscore as not intended for reuse 
 44  from Bio._py3k import urlopen as _urlopen 
 45  from Bio._py3k import urlparse as _urlparse 
 46  from Bio._py3k import unicode 
 47   
 48  __docformat__ = "restructuredtext en" 
 49   
 50  # The following four classes are used to add a member .attributes to integers, 
 51  # strings, lists, and dictionaries, respectively. 
 52   
 53   
class IntegerElement(int):
    """An int that can additionally carry an ``attributes`` member.

    Instances print like ordinary integers until an ``attributes`` member
    has been assigned; after that the repr shows the attributes as well.
    """

    def __repr__(self):
        """Return the int repr, extended with the attributes if present."""
        plain = int.__repr__(self)
        if not hasattr(self, "attributes"):
            return plain
        return "IntegerElement(%s, attributes=%s)" % (plain,
                                                      repr(self.attributes))
62 63
class StringElement(str):
    """A str that can additionally carry an ``attributes`` member.

    Instances print like ordinary strings until an ``attributes`` member
    has been assigned; after that the repr shows the attributes as well.
    """

    def __repr__(self):
        """Return the str repr, extended with the attributes if present."""
        plain = str.__repr__(self)
        if not hasattr(self, "attributes"):
            return plain
        return "StringElement(%s, attributes=%s)" % (plain,
                                                     repr(self.attributes))
72 73
class UnicodeElement(unicode):
    """A unicode string that can additionally carry an ``attributes`` member.

    Instances print like ordinary unicode strings until an ``attributes``
    member has been assigned; after that the repr shows the attributes too.
    """

    def __repr__(self):
        """Return the unicode repr, extended with the attributes if present."""
        plain = unicode.__repr__(self)
        if not hasattr(self, "attributes"):
            return plain
        return "UnicodeElement(%s, attributes=%s)" % (plain,
                                                      repr(self.attributes))
82 83
class ListElement(list):
    """A list that can additionally carry an ``attributes`` member.

    Instances print like ordinary lists until an ``attributes`` member has
    been assigned; after that the repr shows the attributes as well.
    """

    def __repr__(self):
        """Return the list repr, extended with the attributes if present."""
        plain = list.__repr__(self)
        if not hasattr(self, "attributes"):
            return plain
        return "ListElement(%s, attributes=%s)" % (plain,
                                                   repr(self.attributes))
92 93
class DictionaryElement(dict):
    """A dict that can additionally carry an ``attributes`` member.

    Instances print like ordinary dictionaries until an ``attributes``
    member has been assigned; after that the repr shows the attributes as
    well (under the label ``DictElement``).
    """

    def __repr__(self):
        """Return the dict repr, extended with the attributes if present."""
        plain = dict.__repr__(self)
        if not hasattr(self, "attributes"):
            return plain
        return "DictElement(%s, attributes=%s)" % (plain,
                                                   repr(self.attributes))
# A StructureElement is like a dictionary, but some of its keys can have
# multiple values associated with them. These values are stored in a list
# under each key.
class StructureElement(dict):
    """Dictionary in which selected keys accumulate their values in a list.

    The keys passed to the constructor are the "list keys": each starts out
    mapped to an empty list, and every later assignment to such a key
    appends to that list. All other keys behave as in a plain dict.
    """

    def __init__(self, keys):
        """Create the structure with an empty list under each of ``keys``."""
        dict.__init__(self, [(listkey, []) for listkey in keys])
        self.listkeys = keys

    def __setitem__(self, key, value):
        """Append ``value`` for list keys; plain assignment otherwise."""
        if key not in self.listkeys:
            dict.__setitem__(self, key, value)
            return
        self[key].append(value)

    def __repr__(self):
        """Return the dict repr, extended with the attributes if present."""
        plain = dict.__repr__(self)
        if not hasattr(self, "attributes"):
            return plain
        return "DictElement(%s, attributes=%s)" % (plain,
                                                   repr(self.attributes))
127 128
class NotXMLError(ValueError):
    """Error reported when the input does not appear to be XML at all."""

    def __init__(self, message):
        """Remember the underlying problem description in ``msg``."""
        self.msg = message

    def __str__(self):
        """Return the user-facing explanation, embedding ``msg``."""
        template = ("Failed to parse the XML data (%s). "
                    "Please make sure that the input data are in XML format.")
        return template % self.msg
135 136
class CorruptedXMLError(ValueError):
    """Error reported when XML input could not be parsed to the end."""

    def __init__(self, message):
        """Remember the underlying problem description in ``msg``."""
        self.msg = message

    def __str__(self):
        """Return the user-facing explanation, embedding ``msg``."""
        template = ("Failed to parse the XML data (%s). "
                    "Please make sure that the input data are not corrupted.")
        return template % self.msg
143 144
class ValidationError(ValueError):
    """Raised by a validating parser for a tag that the DTD does not define.

    Non-validating parsers do not raise this error. The Bio.Entrez.read and
    Bio.Entrez.parse functions use validating parsers by default (see those
    functions for more information).
    """

    def __init__(self, name):
        """Remember the offending tag name in ``name``."""
        self.name = name

    def __str__(self):
        """Return the user-facing explanation, embedding the tag name."""
        template = ("Failed to find tag '%s' in the DTD. "
                    "To skip all tags that are not represented in the DTD, "
                    "please call Bio.Entrez.read or Bio.Entrez.parse "
                    "with validate=False.")
        return template % self.name
152 153
class DataHandler(object):
    """Expat callback object that converts Entrez XML into Python objects.

    While the DTD referenced by the XML is being parsed, elementDecl sorts
    every declared element into one of the categories errors, strings,
    lists, dictionaries, structures or items. The start/end element
    callbacks then use these categories to build IntegerElement,
    StringElement, UnicodeElement, ListElement, DictionaryElement and
    StructureElement objects, nesting them through ``self.stack``.

    DTD files are looked for in ``local_dtd_dir`` first and in
    ``global_dtd_dir`` second; a DTD found in neither place is downloaded
    and a copy is saved in ``local_dtd_dir``.
    """

    # Per-user directory in which downloaded DTDs are stored. NOTE: the
    # directory is created when this class body is executed.
    import platform
    if platform.system() == 'Windows':
        directory = os.path.join(os.getenv("APPDATA"), "biopython")
    else:  # Unix/Linux/Mac
        home = os.path.expanduser('~')
        directory = os.path.join(home, '.config', 'biopython')
        del home
    local_dtd_dir = os.path.join(directory, 'Bio', 'Entrez', 'DTDs')
    del directory
    del platform
    try:
        os.makedirs(local_dtd_dir)  # use exist_ok=True on Python >= 3.2
    except OSError:
        # Check if local_dtd_dir already exists, and that it is a directory.
        # Trying os.makedirs first and then checking for os.path.isdir avoids
        # a race condition.
        if not os.path.isdir(local_dtd_dir):
            raise

    # Directory with the DTDs that are distributed inside the Bio.Entrez
    # package itself.
    from Bio import Entrez
    global_dtd_dir = os.path.join(str(Entrez.__path__[0]), "DTDs")
    del Entrez

    def __init__(self, validate):
        """Create the expat parser and the (still empty) element categories.

        validate -- if true, startElementHandler raises ValidationError for
                    a tag that is in none of the categories; otherwise such
                    a tag is skipped.
        """
        self.stack = []
        self.errors = []
        self.integers = []
        self.strings = []
        self.lists = []
        self.dictionaries = []
        self.structures = {}
        self.items = []
        self.dtd_urls = []
        self.validating = validate
        self.parser = expat.ParserCreate(namespace_separator=" ")
        self.parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
        # The remaining handlers are installed by xmlDeclHandler, so that a
        # missing XML declaration can be detected afterwards.
        self.parser.XmlDeclHandler = self.xmlDeclHandler

    def _expat_error(self, error):
        """Return the exception to raise for the expat error ``error``."""
        if self.parser.StartElementHandler:
            # We saw the initial <!xml declaration, so we can be sure that
            # we are parsing XML data. Most likely, the XML file is
            # corrupted.
            return CorruptedXMLError(error)
        # We have not seen the initial <!xml declaration, so probably
        # the input data is not in XML format.
        return NotXMLError(error)

    def _missing_object_error(self):
        """Return the exception to raise when no result object was built."""
        if self.parser.StartElementHandler:
            # We saw the initial <!xml declaration, and expat didn't notice
            # any errors, so self.object should be defined. If not, this is
            # a bug.
            return RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at biopython-dev@biopython.org for assistance.")
        # We did not see the initial <!xml declaration, so probably
        # the input data is not in XML format.
        return NotXMLError("XML declaration not found")

    def read(self, handle):
        """Set up the parser and let it parse the XML results.

        Returns the object built for the outermost XML element.

        Raises IOError for a closed handle, CorruptedXMLError or
        NotXMLError if expat reports an error, and RuntimeError or
        NotXMLError if parsing finished without producing an object.
        """
        # HACK: remove Bio._py3k handle conversion, since the Entrez XML parser
        # expects binary data
        if handle.__class__.__name__ == 'EvilHandleHack':
            handle = handle._handle
        if hasattr(handle, "closed") and handle.closed:
            # Should avoid a possible Segmentation Fault, see:
            # http://bugs.python.org/issue4877
            raise IOError("Can't parse a closed handle")
        try:
            self.parser.ParseFile(handle)
        except expat.ExpatError as e:
            raise self._expat_error(e)
        try:
            return self.object
        except AttributeError:
            raise self._missing_object_error()

    def parse(self, handle):
        """Parse the XML in handle incrementally, yielding finished records.

        The data are read in blocks; whenever the outermost list holds more
        than one record, all but the last one are complete and are yielded.

        Raises CorruptedXMLError if the data end while elements are still
        open or if expat reports an error after the XML declaration,
        NotXMLError if no XML declaration was seen, and ValueError if the
        outermost element is not a list.
        """
        BLOCK = 1024
        while True:
            # Read in another block of the file...
            text = handle.read(BLOCK)
            if not text:
                # We have reached the end of the XML file
                if self.stack:
                    # No more XML data, but there is still some unfinished
                    # business. (CorruptedXMLError requires a message.)
                    raise CorruptedXMLError("Premature end of XML stream")
                try:
                    for record in self.object:
                        yield record
                except AttributeError:
                    raise self._missing_object_error()
                self.parser.Parse("", True)
                self.parser = None
                return

            try:
                self.parser.Parse(text, False)
            except expat.ExpatError as e:
                raise self._expat_error(e)

            if not self.stack:
                # Haven't read enough from the XML file yet
                continue

            records = self.stack[0]
            if not isinstance(records, list):
                raise ValueError("The XML file does not represent a list. Please use Entrez.read instead of Entrez.parse")
            while len(records) > 1:  # Then the top record is finished
                record = records.pop(0)
                yield record

    def xmlDeclHandler(self, version, encoding, standalone):
        """Install the remaining handlers once the XML declaration is seen."""
        self.parser.StartElementHandler = self.startElementHandler
        self.parser.EndElementHandler = self.endElementHandler
        self.parser.CharacterDataHandler = self.characterDataHandler
        self.parser.ExternalEntityRefHandler = self.externalEntityRefHandler
        self.parser.StartNamespaceDeclHandler = self.startNamespaceDeclHandler

    def startNamespaceDeclHandler(self, prefix, un):
        """Reject XML namespaces, which this parser does not support."""
        raise NotImplementedError("The Bio.Entrez parser cannot handle XML data that make use of XML namespaces")

    def startElementHandler(self, name, attrs):
        """Create the container for an opening tag and push it on the stack.

        For tags classified as strings, errors or integers nothing is
        created yet; only the attributes are remembered for
        endElementHandler. An unknown tag raises ValidationError when
        validating, and is otherwise represented by "" on the stack so that
        it is left out of the record.
        """
        self.content = ""
        if name in self.lists:
            element = ListElement()
        elif name in self.dictionaries:
            element = DictionaryElement()
        elif name in self.structures:
            element = StructureElement(self.structures[name])
        elif name in self.items:  # Only appears in ESummary
            name = str(attrs["Name"])  # convert from Unicode
            del attrs["Name"]
            itemtype = str(attrs["Type"])  # convert from Unicode
            del attrs["Type"]
            if itemtype == "Structure":
                element = DictionaryElement()
            elif name in ("ArticleIds", "History"):
                element = StructureElement(["pubmed", "medline"])
            elif itemtype == "List":
                element = ListElement()
            else:
                element = StringElement()
            element.itemname = name
            element.itemtype = itemtype
        elif (name in self.strings or name in self.errors or
              name in self.integers):
            self.attributes = attrs
            return
        else:
            # Element not found in DTD
            if self.validating:
                raise ValidationError(name)
            # this will not be stored in the record
            element = ""
        if element != "":
            element.tag = name
            if attrs:
                element.attributes = dict(attrs)
            if self.stack:
                current = self.stack[-1]
                # A parent equal to "" is a skipped element (or an empty
                # string item); it cannot hold children, so the child is
                # left out, as endElementHandler does for simple values.
                if current != "":
                    try:
                        current.append(element)
                    except AttributeError:
                        current[name] = element
        self.stack.append(element)

    def endElementHandler(self, name):
        """Finish the element ``name`` and store its value in the parent.

        Containers are simply popped from the stack (the popped object is
        kept in ``self.object``). For simple elements the collected
        character data are converted to an IntegerElement, StringElement or
        UnicodeElement and added to the container on top of the stack.
        Non-empty content of an error element raises RuntimeError.
        """
        value = self.content
        if name in self.errors:
            if value == "":
                return
            raise RuntimeError(value)
        elif name in self.integers:
            value = IntegerElement(value)
        elif name in self.strings:
            # Convert Unicode strings to plain strings if possible
            try:
                value = StringElement(value)
            except UnicodeEncodeError:
                value = UnicodeElement(value)
        elif name in self.items:
            self.object = self.stack.pop()
            if self.object.itemtype in ("List", "Structure"):
                return
            elif self.object.itemtype == "Integer" and value:
                value = IntegerElement(value)
            else:
                # Convert Unicode strings to plain strings if possible
                try:
                    value = StringElement(value)
                except UnicodeEncodeError:
                    value = UnicodeElement(value)
            name = self.object.itemname
        else:
            self.object = self.stack.pop()
            return
        value.tag = name
        # startElementHandler does not set self.attributes for items, and
        # non-empty attributes are deleted below once used, so the member
        # may be missing here.
        attributes = getattr(self, "attributes", None)
        if attributes:
            value.attributes = dict(attributes)
            del self.attributes
        current = self.stack[-1]
        if current != "":
            try:
                current.append(value)
            except AttributeError:
                current[name] = value

    def characterDataHandler(self, content):
        """Collect character data of the element currently being parsed."""
        self.content += content

    def elementDecl(self, name, model):
        """Classify an element declared in the DTD.

        This callback function is called for each element declaration:
        <!ELEMENT       name          (...)>
        encountered in a DTD. The purpose of this function is to determine
        whether this element should be regarded as a string, integer, list
        dictionary, structure, or error.
        """
        ctype = expat.model
        if name.upper() == "ERROR":
            self.errors.append(name)
            return
        item_model = (ctype.XML_CTYPE_MIXED,
                      ctype.XML_CQUANT_REP,
                      None,
                      ((ctype.XML_CTYPE_NAME,
                        ctype.XML_CQUANT_NONE,
                        'Item',
                        ()),))
        if name == 'Item' and model == item_model:
            # Special case. As far as I can tell, this only occurs in the
            # eSummary DTD.
            self.items.append(name)
            return
        groups = (ctype.XML_CTYPE_SEQ, ctype.XML_CTYPE_CHOICE)
        at_most_once = (ctype.XML_CQUANT_NONE, ctype.XML_CQUANT_OPT)
        repeated = (ctype.XML_CQUANT_PLUS, ctype.XML_CQUANT_REP)
        # First, remove ignorable parentheses around declarations
        while (model[0] in groups and model[1] in at_most_once and
               len(model[3]) == 1):
            model = model[3][0]
        # PCDATA declarations correspond to strings
        if model[0] in (ctype.XML_CTYPE_MIXED, ctype.XML_CTYPE_EMPTY):
            self.strings.append(name)
            return
        # List-type elements
        if model[0] in groups and model[1] in repeated:
            self.lists.append(name)
            return
        # This is the tricky case. Check which keys can occur multiple
        # times. If only one key is possible, and it can occur multiple
        # times, then this is a list. If more than one key is possible,
        # but none of them can occur multiple times, then this is a
        # dictionary. Otherwise, this is a structure.
        # In 'single' and 'multiple', we keep track which keys can occur
        # only once, and which can occur multiple times.
        single = []
        multiple = []

        # The 'count' function is called recursively to make sure all the
        # children in this model are counted. Error keys are ignored;
        # they raise an exception in Python.
        def count(submodel):
            quantifier, key, children = submodel[1:]
            if key is None:
                if quantifier in repeated:
                    for child in children:
                        multiple.append(child[2])
                else:
                    for child in children:
                        count(child)
            elif key.upper() != "ERROR":
                if quantifier in at_most_once:
                    single.append(key)
                elif quantifier in repeated:
                    multiple.append(key)

        count(model)
        if not single and len(multiple) == 1:
            self.lists.append(name)
        elif not multiple:
            self.dictionaries.append(name)
        else:
            self.structures[name] = multiple

    def open_dtd_file(self, filename):
        """Open the DTD ``filename`` for binary reading, or return None.

        The per-user directory is tried before the directory inside the
        Bio.Entrez package.
        """
        for directory in (DataHandler.local_dtd_dir,
                          DataHandler.global_dtd_dir):
            path = os.path.join(directory, filename)
            try:
                return open(path, "rb")
            except IOError:
                continue
        return None

    def save_dtd_file(self, filename, text):
        """Save the DTD contents ``text`` as ``filename`` in local_dtd_dir.

        If the file cannot be opened for writing, a warning is issued and
        nothing is saved.
        """
        path = os.path.join(DataHandler.local_dtd_dir, filename)
        try:
            handle = open(path, "wb")
        except IOError:
            warnings.warn("Failed to save %s at %s" % (filename, path))
            return
        # Close the file even if writing fails.
        with handle:
            handle.write(text)

    def externalEntityRefHandler(self, context, base, systemId, publicId):
        """Load the DTD referenced by the XML and parse its declarations.

        The purpose of this function is to load the DTD locally, instead
        of downloading it from the URL specified in the XML. Using the local
        DTD results in much faster parsing. If the DTD is not found locally,
        we try to download it. If new DTDs become available from NCBI,
        putting them in Bio/Entrez/DTDs will allow the parser to see them.

        Returns 1, which tells expat to continue parsing. Raises ValueError
        for an unsupported URL scheme and RuntimeError if the DTD can be
        found neither locally nor online.
        """
        scheme = _urlparse(systemId)[0]
        if scheme in ('http', 'https', 'ftp'):
            # Then this is an absolute path to the DTD.
            url = systemId
        elif scheme == '':
            # Then this is a relative path to the DTD.
            # Look at the parent URL to find the full path.
            try:
                parent = self.dtd_urls[-1]
            except IndexError:
                # Assume the default URL for DTDs if the top parent
                # does not contain an absolute path
                source = "http://www.ncbi.nlm.nih.gov/dtd/"
            else:
                source = os.path.dirname(parent)
            # urls always have a forward slash, don't use os.path.join
            url = source.rstrip("/") + "/" + systemId
        else:
            raise ValueError("Unexpected URL scheme %r" % (scheme,))
        self.dtd_urls.append(url)
        try:
            # First, try to load the local version of the DTD file
            filename = os.path.basename(systemId)
            handle = self.open_dtd_file(filename)
            if not handle:
                # DTD is not available as a local file. Try accessing it
                # through the internet instead.
                try:
                    remote = _urlopen(url)
                except IOError:
                    raise RuntimeError("Failed to access %s at %s" % (filename, url))
                try:
                    text = remote.read()
                finally:
                    remote.close()
                self.save_dtd_file(filename, text)
                handle = BytesIO(text)
            try:
                parser = self.parser.ExternalEntityParserCreate(context)
                parser.ElementDeclHandler = self.elementDecl
                parser.ParseFile(handle)
            finally:
                handle.close()
        finally:
            # Keep the URL stack balanced even if parsing the DTD failed.
            self.dtd_urls.pop()
        return 1
535