Package Bio :: Package Entrez :: Module Parser
[hide private]
[frames] | no frames]

Source Code for Module Bio.Entrez.Parser

  1  # Copyright 2008 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Parser for XML results returned by NCBI's Entrez Utilities. This 
  7  parser is used by the read() function in Bio.Entrez, and is not intended 
  8  to be used directly. 
  9  """ 
 10   
 11  # The question is how to represent an XML file as Python objects. Some 
 12  # XML files returned by NCBI look like lists, others look like dictionaries, 
 13  # and others look like a mix of lists and dictionaries. 
 14  # 
 15  # My approach is to classify each possible element in the XML as a plain 
 16  # string, an integer, a list, a dictionary, or a structure. The latter is a 
 17  # dictionary where the same key can occur multiple times; in Python, it is 
 18  # represented as a dictionary where that key occurs once, pointing to a list 
 19  # of values found in the XML file. 
 20  # 
 21  # The parser then goes through the XML and creates the appropriate Python 
 22  # object for each element. The different levels encountered in the XML are 
 23  # preserved on the Python side. So a subelement of a subelement of an element 
 24  # is a value in a dictionary that is stored in a list which is a value in 
 25  # some other dictionary (or a value in a list which itself belongs to a list 
 26  # which is a value in a dictionary, and so on). Attributes encountered in 
 27  # the XML are stored as a dictionary in a member .attributes of each element, 
 28  # and the tag name is saved in a member .tag. 
 29  # 
 30  # To decide which kind of Python object corresponds to each element in the 
 31  # XML, the parser analyzes the DTD referred to at the top of (almost) every 
 32  # XML file returned by the Entrez Utilities. This is preferred over a hand- 
 33  # written solution, since the number of DTDs is rather large and their 
 34  # contents may change over time. About half the code in this parser deals 
 35  # with parsing the DTD, and the other half with the XML itself. 
 36   
 37   
 38  import os.path 
 39  import urlparse 
 40  import urllib 
 41  import warnings 
 42  from xml.parsers import expat 
 43   
 44  # The following five classes are used to add a member .attributes to integers, 
 45  # strings, Unicode strings, lists, and dictionaries, respectively. 
 46   
 47   
class IntegerElement(int):
    """An int that can carry the XML attributes of the element it came from.

    The attributes, when present, live in an instance member ``attributes``
    assigned after construction.
    """

    def __repr__(self):
        """Return the int repr, extended with the attributes if they are set."""
        plain = int.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached: look exactly like a built-in int.
            return plain
        return "IntegerElement(%s, attributes=%s)" % (plain,
                                                      repr(self.attributes))
56 57
class StringElement(str):
    """A str that can carry the XML attributes of the element it came from.

    The attributes, when present, live in an instance member ``attributes``
    assigned after construction.
    """

    def __repr__(self):
        """Return the str repr, extended with the attributes if they are set."""
        plain = str.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached: look exactly like a built-in str.
            return plain
        return "StringElement(%s, attributes=%s)" % (plain,
                                                     repr(self.attributes))
66 67
class UnicodeElement(unicode):
    """A unicode string that can carry the XML attributes of its element.

    The attributes, when present, live in an instance member ``attributes``
    assigned after construction.
    """

    def __repr__(self):
        """Return the unicode repr, extended with the attributes if set."""
        plain = unicode.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached: look like a built-in unicode.
            return plain
        return "UnicodeElement(%s, attributes=%s)" % (plain,
                                                      repr(self.attributes))
76 77
class ListElement(list):
    """A list that can carry the XML attributes of the element it came from.

    The attributes, when present, live in an instance member ``attributes``
    assigned after construction.
    """

    def __repr__(self):
        """Return the list repr, extended with the attributes if they are set."""
        plain = list.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached: look exactly like a built-in list.
            return plain
        return "ListElement(%s, attributes=%s)" % (plain,
                                                   repr(self.attributes))
86 87
class DictionaryElement(dict):
    """A dict that can carry the XML attributes of the element it came from.

    The attributes, when present, live in an instance member ``attributes``
    assigned after construction.
    """

    def __repr__(self):
        """Return the dict repr, extended with the attributes if they are set.

        Note that the extended form is labelled ``DictElement``, not
        ``DictionaryElement``.
        """
        plain = dict.__repr__(self)
        if not hasattr(self, "attributes"):
            # No attributes were attached: look exactly like a built-in dict.
            return plain
        return "DictElement(%s, attributes=%s)" % (plain,
                                                   repr(self.attributes))
# A StructureElement is like a dictionary, but some of its keys can have
# multiple values associated with them. These values are stored in a list
# under each key.
class StructureElement(dict):
    """A dict in which selected keys accumulate their values in a list.

    The keys given to the constructor (kept in ``listkeys``) start out mapped
    to an empty list, and every item assignment to one of them appends to that
    list. All other keys behave like ordinary dictionary keys.
    """

    def __init__(self, keys):
        """Create the structure; each key in *keys* collects multiple values."""
        dict.__init__(self)
        for listkey in keys:
            # Bypass our own __setitem__, which would try to append.
            dict.__setitem__(self, listkey, [])
        self.listkeys = keys

    def __setitem__(self, key, value):
        """Append *value* for a list key; plain assignment for any other key."""
        if key not in self.listkeys:
            dict.__setitem__(self, key, value)
            return
        self[key].append(value)

    def __repr__(self):
        """Return the dict repr, extended with the attributes if they are set.

        The extended form is labelled ``DictElement``.
        """
        plain = dict.__repr__(self)
        if not hasattr(self, "attributes"):
            return plain
        return "DictElement(%s, attributes=%s)" % (plain,
                                                   repr(self.attributes))
121 122
class NotXMLError(ValueError):
    """Raised when the data handed to the parser do not look like XML.

    The original cause (a message string or a lower-level exception) is kept
    in the ``msg`` attribute and embedded in the string form of the error.
    """

    def __init__(self, message):
        """Store *message* as the underlying reason for the failure."""
        # Forward the message to the base class so that ``args`` is populated.
        # Without this, ``args`` stays empty and the exception cannot be
        # re-created from it (e.g. when copied or unpickled).
        ValueError.__init__(self, message)
        self.msg = message

    def __str__(self):
        """Return a user-oriented explanation including the stored reason."""
        # Wrap in a 1-tuple so a tuple-valued message is formatted as a whole.
        return "Failed to parse the XML data (%s). Please make sure that the input data are in XML format." % (self.msg,)
129 130
class CorruptedXMLError(ValueError):
    """Raised when XML data were recognized but could not be parsed fully.

    The original cause (a message string or a lower-level exception) is kept
    in the ``msg`` attribute and embedded in the string form of the error.
    """

    def __init__(self, message):
        """Store *message* as the underlying reason for the failure."""
        # Forward the message to the base class so that ``args`` is populated.
        # Without this, ``args`` stays empty and the exception cannot be
        # re-created from it (e.g. when copied or unpickled).
        ValueError.__init__(self, message)
        self.msg = message

    def __str__(self):
        """Return a user-oriented explanation including the stored reason."""
        # Wrap in a 1-tuple so a tuple-valued message is formatted as a whole.
        return "Failed to parse the XML data (%s). Please make sure that the input data are not corrupted." % (self.msg,)
137 138
class ValidationError(ValueError):
    """Validating parsers raise this error if the parser finds a tag in the
    XML that is not defined in the DTD. Non-validating parsers do not raise
    this error. The Bio.Entrez.read and Bio.Entrez.parse functions use
    validating parsers by default (see those functions for more information).

    The offending tag name is available as the ``name`` attribute.
    """

    def __init__(self, name):
        """Store *name*, the tag that was not found in the DTD."""
        # Forward the tag name to the base class so that ``args`` is
        # populated. Without this, ``args`` stays empty and the exception
        # cannot be re-created from it (e.g. when copied or unpickled).
        ValueError.__init__(self, name)
        self.name = name

    def __str__(self):
        """Return a user-oriented explanation naming the unknown tag."""
        # Wrap in a 1-tuple so a tuple-valued name is formatted as a whole.
        return "Failed to find tag '%s' in the DTD. To skip all tags that are not represented in the DTD, please call Bio.Entrez.read or Bio.Entrez.parse with validate=False." % (self.name,)
146 147
class DataHandler(object):
    """Expat-driven handler that converts Entrez XML into Python objects.

    While the DTD referenced by the XML is parsed, elementDecl() sorts every
    declared element name into one of the instance collections:

      - errors:       elements named "ERROR" (any case)
      - strings:      elements holding character data (or declared empty)
      - lists:        elements whose children form a list
      - dictionaries: elements whose children can each occur at most once
      - structures:   maps an element name to the child names that may occur
                      multiple times
      - items:        the special 'Item' element of the eSummary DTD
      - integers:     consulted by the element handlers, but never filled by
                      the code in this class

    The start/end element handlers then use this classification to build the
    result, keeping the currently open container elements on self.stack. The
    most recently closed container is stored in self.object; after a complete
    parse this is the root object.

    Class attributes:
      - local_dtd_dir:  per-user directory that is searched first for DTDs
      - global_dtd_dir: DTD directory inside the installed Bio.Entrez package
    """

    home = os.path.expanduser('~')
    local_dtd_dir = os.path.join(home, '.biopython', 'Bio', 'Entrez', 'DTDs')
    del home

    from Bio import Entrez
    global_dtd_dir = os.path.join(str(Entrez.__path__[0]), "DTDs")
    del Entrez

    def __init__(self, validate):
        """Create the expat parser.

        If validate is true, tags that are not declared in the DTD raise a
        ValidationError; otherwise they are skipped.
        """
        self.stack = []
        self.errors = []
        self.integers = []
        self.strings = []
        self.lists = []
        self.dictionaries = []
        self.structures = {}
        self.items = []
        self.dtd_urls = []
        self.validating = validate
        self.parser = expat.ParserCreate(namespace_separator=" ")
        self.parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
        # The remaining handlers are only installed once an XML declaration
        # has been seen (see xmlDeclHandler); read() and parse() rely on this
        # to tell "corrupted XML" apart from "not XML at all".
        self.parser.XmlDeclHandler = self.xmlDeclHandler

    def read(self, handle):
        """Set up the parser and let it parse the XML results.

        Returns the root object built from the XML. Raises IOError for a
        closed handle, CorruptedXMLError or NotXMLError if expat fails, and
        RuntimeError if parsing finished without producing an object.
        """
        # HACK: remove Bio._py3k handle conversion, since the Entrez XML parser
        # expects binary data
        if handle.__class__.__name__ == 'EvilHandleHack':
            handle = handle._handle
        if hasattr(handle, "closed") and handle.closed:
            # Should avoid a possible Segmentation Fault, see:
            # http://bugs.python.org/issue4877
            raise IOError("Can't parse a closed handle")
        try:
            self.parser.ParseFile(handle)
        except expat.ExpatError as e:
            if self.parser.StartElementHandler:
                # We saw the initial <?xml declaration, so we can be sure that
                # we are parsing XML data. Most likely, the XML file is
                # corrupted.
                raise CorruptedXMLError(e)
            else:
                # We have not seen the initial <?xml declaration, so probably
                # the input data is not in XML format.
                raise NotXMLError(e)
        try:
            return self.object
        except AttributeError:
            if self.parser.StartElementHandler:
                # We saw the initial <?xml declaration, and expat didn't notice
                # any errors, so self.object should be defined. If not, this is
                # a bug.
                raise RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at biopython-dev@biopython.org for assistance.")
            else:
                # We did not see the initial <?xml declaration, so probably
                # the input data is not in XML format.
                raise NotXMLError("XML declaration not found")

    def parse(self, handle):
        """Parse the XML incrementally, yielding top-level records one by one.

        The data are read from handle in blocks of 1024; after each block all
        records of the top-level list except the last one (which may still be
        under construction) are yielded. Raises ValueError if the top-level
        element is not a list, and CorruptedXMLError or NotXMLError if the
        data cannot be parsed.
        """
        BLOCK = 1024
        while True:
            # Read in another block of the file...
            text = handle.read(BLOCK)
            if not text:
                # We have reached the end of the XML file
                if self.stack:
                    # No more XML data, but there is still some unfinished
                    # business. The exception class requires a message;
                    # raising the bare class would fail with a TypeError.
                    raise CorruptedXMLError("Premature end of XML stream")
                try:
                    for record in self.object:
                        yield record
                except AttributeError:
                    if self.parser.StartElementHandler:
                        # We saw the initial <?xml declaration, and expat
                        # didn't notice any errors, so self.object should be
                        # defined. If not, this is a bug.
                        raise RuntimeError("Failed to parse the XML file correctly, possibly due to a bug in Bio.Entrez. Please contact the Biopython developers at biopython-dev@biopython.org for assistance.")
                    else:
                        # We did not see the initial <?xml declaration, so
                        # probably the input data is not in XML format.
                        raise NotXMLError("XML declaration not found")
                self.parser.Parse("", True)
                self.parser = None
                return

            try:
                self.parser.Parse(text, False)
            except expat.ExpatError as e:
                if self.parser.StartElementHandler:
                    # We saw the initial <?xml declaration, so we can be sure
                    # that we are parsing XML data. Most likely, the XML file
                    # is corrupted.
                    raise CorruptedXMLError(e)
                else:
                    # We have not seen the initial <?xml declaration, so
                    # probably the input data is not in XML format.
                    raise NotXMLError(e)

            if not self.stack:
                # Haven't read enough from the XML file yet
                continue

            records = self.stack[0]
            if not isinstance(records, list):
                raise ValueError("The XML file does not represent a list. Please use Entrez.read instead of Entrez.parse")
            while len(records) > 1:  # Then the top record is finished
                record = records.pop(0)
                yield record

    def xmlDeclHandler(self, version, encoding, standalone):
        """Install the remaining handlers once an XML declaration is found."""
        # XML declaration found; set the handlers
        self.parser.StartElementHandler = self.startElementHandler
        self.parser.EndElementHandler = self.endElementHandler
        self.parser.CharacterDataHandler = self.characterDataHandler
        self.parser.ExternalEntityRefHandler = self.externalEntityRefHandler
        self.parser.StartNamespaceDeclHandler = self.startNamespaceDeclHandler

    def startNamespaceDeclHandler(self, prefix, un):
        """Reject XML namespaces, which this parser does not support."""
        raise NotImplementedError("The Bio.Entrez parser cannot handle XML data that make use of XML namespaces")

    def startElementHandler(self, name, attrs):
        """Open a new element.

        Container elements (lists, dictionaries, structures, container-type
        Items) are created here, attached to their parent and pushed on the
        stack. For string, integer and error elements only the attributes
        are remembered; the value is created in endElementHandler. Raises
        ValidationError for an undeclared tag when validating.
        """
        self.content = ""
        if name in self.lists:
            element = ListElement()
        elif name in self.dictionaries:
            element = DictionaryElement()
        elif name in self.structures:
            element = StructureElement(self.structures[name])
        elif name in self.items:  # Only appears in ESummary
            name = str(attrs["Name"])  # convert from Unicode
            del attrs["Name"]
            itemtype = str(attrs["Type"])  # convert from Unicode
            del attrs["Type"]
            if itemtype == "Structure":
                element = DictionaryElement()
            elif name in ("ArticleIds", "History"):
                element = StructureElement(["pubmed", "medline"])
            elif itemtype == "List":
                element = ListElement()
            else:
                # Scalar item: an empty placeholder that only carries the
                # item name and type to endElementHandler. Because it
                # compares equal to "", it is not attached to the parent.
                element = StringElement()
            element.itemname = name
            element.itemtype = itemtype
        elif (name in self.strings or name in self.errors
              or name in self.integers):
            self.attributes = attrs
            return
        else:
            # Element not found in DTD
            if self.validating:
                raise ValidationError(name)
            else:
                # this will not be stored in the record
                element = ""
        if element != "":
            element.tag = name
            if attrs:
                element.attributes = dict(attrs)
            if len(self.stack) != 0:
                current = self.stack[-1]
                # A parent equal to "" is either an element that is being
                # skipped (not in the DTD, non-validating mode) or a scalar
                # item placeholder; neither can hold children.
                if current != "":
                    try:
                        current.append(element)
                    except AttributeError:
                        current[name] = element
        self.stack.append(element)

    def endElementHandler(self, name):
        """Close an element.

        For string, integer and scalar-Item elements, the collected character
        data are converted to the appropriate type and stored in the parent.
        For container elements, the finished container is popped off the
        stack into self.object. A non-empty error element raises
        RuntimeError with its text.
        """
        value = self.content
        if name in self.errors:
            if value == "":
                return
            else:
                raise RuntimeError(value)
        elif name in self.integers:
            value = IntegerElement(value)
        elif name in self.strings:
            # Convert Unicode strings to plain strings if possible
            try:
                value = StringElement(value)
            except UnicodeEncodeError:
                value = UnicodeElement(value)
        elif name in self.items:
            self.object = self.stack.pop()
            if self.object.itemtype in ("List", "Structure"):
                return
            elif self.object.itemtype == "Integer" and value:
                value = IntegerElement(value)
            else:
                # Convert Unicode strings to plain strings if possible
                try:
                    value = StringElement(value)
                except UnicodeEncodeError:
                    value = UnicodeElement(value)
            name = self.object.itemname
        else:
            self.object = self.stack.pop()
            return
        value.tag = name
        # self.attributes is deleted below once it has been consumed, and it
        # is not set again for scalar items, so it may legitimately be
        # missing at this point.
        attributes = getattr(self, "attributes", None)
        if attributes:
            value.attributes = dict(attributes)
            del self.attributes
        current = self.stack[-1]
        if current != "":
            try:
                current.append(value)
            except AttributeError:
                current[name] = value

    def characterDataHandler(self, content):
        """Accumulate character data of the current element."""
        self.content += content

    def elementDecl(self, name, model):
        """This callback function is called for each element declaration:
        <!ELEMENT       name          (...)>
        encountered in a DTD. The purpose of this function is to determine
        whether this element should be regarded as a string, integer, list
        dictionary, structure, or error."""
        if name.upper() == "ERROR":
            self.errors.append(name)
            return
        if name == 'Item' and model == (expat.model.XML_CTYPE_MIXED,
                                        expat.model.XML_CQUANT_REP,
                                        None, ((expat.model.XML_CTYPE_NAME,
                                                expat.model.XML_CQUANT_NONE,
                                                'Item',
                                                ()
                                                ),
                                               )
                                        ):
            # Special case. As far as I can tell, this only occurs in the
            # eSummary DTD.
            self.items.append(name)
            return
        # First, remove ignorable parentheses around declarations
        while (model[0] in (expat.model.XML_CTYPE_SEQ,
                            expat.model.XML_CTYPE_CHOICE)
               and model[1] in (expat.model.XML_CQUANT_NONE,
                                expat.model.XML_CQUANT_OPT)
               and len(model[3]) == 1):
            model = model[3][0]
        # PCDATA declarations correspond to strings
        if model[0] in (expat.model.XML_CTYPE_MIXED,
                        expat.model.XML_CTYPE_EMPTY):
            self.strings.append(name)
            return
        # List-type elements
        if (model[0] in (expat.model.XML_CTYPE_CHOICE,
                         expat.model.XML_CTYPE_SEQ) and
            model[1] in (expat.model.XML_CQUANT_PLUS,
                         expat.model.XML_CQUANT_REP)):
            self.lists.append(name)
            return
        # This is the tricky case. Check which keys can occur multiple
        # times. If only one key is possible, and it can occur multiple
        # times, then this is a list. If more than one key is possible,
        # but none of them can occur multiple times, then this is a
        # dictionary. Otherwise, this is a structure.
        # In 'single' and 'multiple', we keep track which keys can occur
        # only once, and which can occur multiple times.
        single = []
        multiple = []
        # The 'count' function is called recursively to make sure all the
        # children in this model are counted. Error keys are ignored;
        # they raise an exception in Python.

        def count(submodel):
            quantifier, child_name, children = submodel[1:]
            if child_name is None:
                if quantifier in (expat.model.XML_CQUANT_PLUS,
                                  expat.model.XML_CQUANT_REP):
                    # A repeated group: each direct child may occur
                    # multiple times.
                    for child in children:
                        multiple.append(child[2])
                else:
                    for child in children:
                        count(child)
            elif child_name.upper() != "ERROR":
                if quantifier in (expat.model.XML_CQUANT_NONE,
                                  expat.model.XML_CQUANT_OPT):
                    single.append(child_name)
                elif quantifier in (expat.model.XML_CQUANT_PLUS,
                                    expat.model.XML_CQUANT_REP):
                    multiple.append(child_name)

        count(model)
        if len(single) == 0 and len(multiple) == 1:
            self.lists.append(name)
        elif len(multiple) == 0:
            self.dictionaries.append(name)
        else:
            self.structures.update({name: multiple})

    def open_dtd_file(self, filename):
        """Return a binary handle to a local copy of the DTD, or None.

        The per-user DTD directory is tried first, then the directory in the
        installed package.
        """
        for directory in (DataHandler.local_dtd_dir,
                          DataHandler.global_dtd_dir):
            path = os.path.join(directory, filename)
            try:
                return open(path, "rb")
            except IOError:
                # Not available in this directory; try the next one.
                continue
        return None

    def externalEntityRefHandler(self, context, base, systemId, publicId):
        """The purpose of this function is to load the DTD locally, instead
        of downloading it from the URL specified in the XML. Using the local
        DTD results in much faster parsing. If the DTD is not found locally,
        we try to download it. If new DTDs become available from NCBI,
        putting them in Bio/Entrez/DTDs will allow the parser to see them.

        Raises ValueError if the DTD reference uses an unsupported URL
        scheme, and RuntimeError if the DTD cannot be downloaded."""
        urlinfo = urlparse.urlparse(systemId)
        # Index access instead of urlinfo.scheme, as the attribute requires
        # Python 2.5+
        scheme = urlinfo[0]
        if scheme in ('http', 'https', 'ftp'):
            # Then this is an absolute path to the DTD.
            url = systemId
        elif scheme == '':
            # Then this is a relative path to the DTD.
            # Look at the parent URL to find the full path.
            try:
                url = self.dtd_urls[-1]
            except IndexError:
                # Assume the default URL for DTDs if the top parent
                # does not contain an absolute path
                source = "http://www.ncbi.nlm.nih.gov/dtd/"
            else:
                source = os.path.dirname(url)
            # urls always have a forward slash, don't use os.path.join
            url = source.rstrip("/") + "/" + systemId
        else:
            # Fail with a clear message instead of an UnboundLocalError on
            # 'url' further down.
            raise ValueError("Unexpected URL scheme %r in DTD reference %r"
                             % (scheme, systemId))
        self.dtd_urls.append(url)
        try:
            # First, try to load the local version of the DTD file
            filename = os.path.basename(systemId)
            handle = self.open_dtd_file(filename)
            if not handle:
                # DTD is not available as a local file. Try accessing it
                # through the internet instead.
                message = """\
Unable to load DTD file %s.

Bio.Entrez uses NCBI's DTD files to parse XML files returned by NCBI Entrez.
Though most of NCBI's DTD files are included in the Biopython distribution,
sometimes you may find that a particular DTD file is missing. While we can
access the DTD file through the internet, the parser is much faster if the
required DTD files are available locally.

For this purpose, please download %s from

%s

and save it either in directory

%s

or in directory

%s

in order for Bio.Entrez to find it.

Alternatively, you can save %s in the directory
Bio/Entrez/DTDs in the Biopython distribution, and reinstall Biopython.

Please also inform the Biopython developers about this missing DTD, by
reporting a bug on http://bugzilla.open-bio.org/ or sign up to our mailing
list and emailing us, so that we can include it with the next release of
Biopython.

Proceeding to access the DTD file through the internet...
""" % (filename, filename, url, self.global_dtd_dir, self.local_dtd_dir, filename)
                warnings.warn(message)
                # NOTE(review): the URL originates from the XML document
                # being parsed, so untrusted input can make the parser fetch
                # an arbitrary location. Consider restricting the allowed
                # hosts.
                try:
                    handle = urllib.urlopen(url)
                except IOError:
                    raise RuntimeError("Failed to access %s at %s" % (filename, url))
            try:
                parser = self.parser.ExternalEntityParserCreate(context)
                parser.ElementDeclHandler = self.elementDecl
                parser.ParseFile(handle)
            finally:
                # Release the file or network handle even if the DTD turns
                # out to be unparsable.
                handle.close()
        finally:
            # Keep the stack of parent DTD URLs balanced on every exit path.
            self.dtd_urls.pop()
        return 1
534