Package Bio :: Package SeqIO :: Module UniprotIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.UniprotIO

  1  # Copyright 2010 by Andrea Pierleoni 
  2  # Revisions copyright 2010 by Peter Cock 
  3  # All rights reserved. 
  4  # 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8   
  9  """Bio.SeqIO support for the "uniprot-xml" file format. 
 10   
 11  See also: 
 12   
 13  http://www.uniprot.org 
 14   
 15  The UniProt XML format essentially replaces the old plain text file format 
 16  originally introduced by SwissProt ("swiss" format in Bio.SeqIO). 
 17  """ 
 18  import sys 
 19   
 20  from Bio import Seq 
 21  from Bio import SeqFeature 
 22  from Bio import Alphabet 
 23  from Bio.SeqRecord import SeqRecord 
 24  from Bio._py3k import StringIO 
 25   
 26   
 27  #For speed try to use cElementTree rather than ElementTree 
 28  try: 
 29      if (3, 0) <= sys.version_info[:2] <= (3, 1): 
 30          # Workaround for bug in python 3.0 and 3.1, 
 31          # see http://bugs.python.org/issue9257 
 32          from xml.etree import ElementTree as ElementTree 
 33      else: 
 34          from xml.etree import cElementTree as ElementTree 
 35  except ImportError: 
 36      from xml.etree import ElementTree as ElementTree 
 37   
 38  NS = "{http://uniprot.org/uniprot}" 
 39  REFERENCE_JOURNAL = "%(name)s %(volume)s:%(first)s-%(last)s(%(pub_date)s)" 
 40   
 41   
42 -def UniprotIterator(handle, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
43 """Generator function to parse UniProt XML as SeqRecord objects. 44 45 parses an XML entry at a time from any UniProt XML file 46 returns a SeqRecord for each iteration 47 48 This generator can be used in Bio.SeqIO 49 50 return_raw_comments = True --> comment fields are returned as complete XML to allow further processing 51 skip_parsing_errors = True --> if parsing errors are found, skip to next entry 52 """ 53 if isinstance(alphabet, Alphabet.NucleotideAlphabet): 54 raise ValueError("Wrong alphabet %r" % alphabet) 55 if isinstance(alphabet, Alphabet.Gapped): 56 if isinstance(alphabet.alphabet, Alphabet.NucleotideAlphabet): 57 raise ValueError("Wrong alphabet %r" % alphabet) 58 59 if not hasattr(handle, "read"): 60 if isinstance(handle, str): 61 handle = StringIO(handle) 62 else: 63 raise Exception('An XML-containing handler or an XML string must be passed') 64 65 if ElementTree is None: 66 from Bio import MissingExternalDependencyError 67 raise MissingExternalDependencyError( 68 "No ElementTree module was found. " 69 "Use Python 2.5+, lxml or elementtree if you " 70 "want to use Bio.SeqIO.UniprotIO.") 71 72 for event, elem in ElementTree.iterparse(handle, events=("start", "end")): 73 if event == "end" and elem.tag == NS + "entry": 74 yield Parser(elem, alphabet=alphabet, return_raw_comments=return_raw_comments).parse() 75 elem.clear()
76 77
78 -class Parser(object):
79 """Parse a UniProt XML entry to a SeqRecord. 80 81 return_raw_comments=True to get back the complete comment field in XML format 82 alphabet=Alphabet.ProteinAlphabet() can be modified if needed, default is protein alphabet. 83 """
84 - def __init__(self, elem, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
85 self.entry = elem 86 self.alphabet = alphabet 87 self.return_raw_comments = return_raw_comments
88
89 - def parse(self):
90 """Parse the input.""" 91 assert self.entry.tag == NS + 'entry' 92 93 def append_to_annotations(key, value): 94 if key not in self.ParsedSeqRecord.annotations: 95 self.ParsedSeqRecord.annotations[key] = [] 96 if value not in self.ParsedSeqRecord.annotations[key]: 97 self.ParsedSeqRecord.annotations[key].append(value)
98 99 def _parse_name(element): 100 self.ParsedSeqRecord.name = element.text 101 self.ParsedSeqRecord.dbxrefs.append(self.dbname + ':' + element.text)
102 103 def _parse_accession(element): 104 append_to_annotations('accessions', element.text) # to cope with SwissProt plain text parser 105 self.ParsedSeqRecord.dbxrefs.append(self.dbname + ':' + element.text) 106 107 def _parse_protein(element): 108 """Parse protein names (PRIVATE).""" 109 descr_set = False 110 for protein_element in element: 111 if protein_element.tag in [NS + 'recommendedName', NS + 'alternativeName']: # recommendedName tag are parsed before 112 #use protein fields for name and description 113 for rec_name in protein_element: 114 ann_key = '%s_%s' % (protein_element.tag.replace(NS, ''), 115 rec_name.tag.replace(NS, '')) 116 append_to_annotations(ann_key, rec_name.text) 117 if (rec_name.tag == NS + 'fullName') and not descr_set: 118 self.ParsedSeqRecord.description = rec_name.text 119 descr_set = True 120 elif protein_element.tag == NS + 'component': 121 pass # not parsed 122 elif protein_element.tag == NS + 'domain': 123 pass # not parsed 124 125 def _parse_gene(element): 126 for genename_element in element: 127 if 'type' in genename_element.attrib: 128 ann_key = 'gene_%s_%s' % (genename_element.tag.replace(NS, ''), 129 genename_element.attrib['type']) 130 if genename_element.attrib['type'] == 'primary': 131 self.ParsedSeqRecord.annotations[ann_key] = genename_element.text 132 else: 133 append_to_annotations(ann_key, genename_element.text) 134 135 def _parse_geneLocation(element): 136 append_to_annotations('geneLocation', element.attrib['type']) 137 138 def _parse_organism(element): 139 organism_name = com_name = sci_name = '' 140 for organism_element in element: 141 if organism_element.tag == NS + 'name': 142 if organism_element.text: 143 if organism_element.attrib['type'] == 'scientific': 144 sci_name = organism_element.text 145 elif organism_element.attrib['type'] == 'common': 146 com_name = organism_element.text 147 else: 148 #e.g. synonym 149 append_to_annotations("organism_name", organism_element.text) 150 elif organism_element.tag == NS + 'dbReference': 151 self.ParsedSeqRecord.dbxrefs.append(organism_element.attrib['type'] + ':' + organism_element.attrib['id']) 152 elif organism_element.tag == NS + 'lineage': 153 for taxon_element in organism_element: 154 if taxon_element.tag == NS + 'taxon': 155 append_to_annotations('taxonomy', taxon_element.text) 156 if sci_name and com_name: 157 organism_name = '%s (%s)' % (sci_name, com_name) 158 elif sci_name: 159 organism_name = sci_name 160 elif com_name: 161 organism_name = com_name 162 self.ParsedSeqRecord.annotations['organism'] = organism_name 163 164 def _parse_organismHost(element): 165 for organism_element in element: 166 if organism_element.tag == NS + 'name': 167 append_to_annotations("organism_host", organism_element.text) 168 169 def _parse_keyword(element): 170 append_to_annotations('keywords', element.text) 171 172 def _parse_comment(element): 173 """Parse comments (PRIVATE). 174 175 Comment fields are very heterogeneus. each type has his own (frequently mutated) schema. 176 To store all the contained data, more complex data structures are needed, such as 177 annotated dictionaries. This is left to end user, by optionally setting: 178 179 return_raw_comments=True 180 181 The original XML is returned in the annotation fields. 182 183 Available comment types at december 2009: 184 "allergen" 185 "alternative products" 186 "biotechnology" 187 "biophysicochemical properties" 188 "catalytic activity" 189 "caution" 190 "cofactor" 191 "developmental stage" 192 "disease" 193 "domain" 194 "disruption phenotype" 195 "enzyme regulation" 196 "function" 197 "induction" 198 "miscellaneous" 199 "pathway" 200 "pharmaceutical" 201 "polymorphism" 202 "PTM" 203 "RNA editing" 204 "similarity" 205 "subcellular location" 206 "sequence caution" 207 "subunit" 208 "tissue specificity" 209 "toxic dose" 210 "online information" 211 "mass spectrometry" 212 "interaction" 213 """ 214 215 simple_comments = ["allergen", 216 "biotechnology", 217 "biophysicochemical properties", 218 "catalytic activity", 219 "caution", 220 "cofactor", 221 "developmental stage", 222 "disease", 223 "domain", 224 "disruption phenotype", 225 "enzyme regulation", 226 "function", 227 "induction", 228 "miscellaneous", 229 "pathway", 230 "pharmaceutical", 231 "polymorphism", 232 "PTM", 233 "RNA editing", # positions not parsed 234 "similarity", 235 "subunit", 236 "tissue specificity", 237 "toxic dose", 238 ] 239 240 if element.attrib['type'] in simple_comments: 241 ann_key = 'comment_%s' % element.attrib['type'].replace(' ', '') 242 for text_element in element.getiterator(NS + 'text'): 243 if text_element.text: 244 append_to_annotations(ann_key, text_element.text) 245 elif element.attrib['type'] == 'subcellular location': 246 for subloc_element in element.getiterator(NS + 'subcellularLocation'): 247 for el in subloc_element: 248 if el.text: 249 ann_key = 'comment_%s_%s' % (element.attrib['type'].replace(' ', ''), el.tag.replace(NS, '')) 250 append_to_annotations(ann_key, el.text) 251 elif element.attrib['type'] == 'interaction': 252 for interact_element in element.getiterator(NS + 'interactant'): 253 ann_key = 'comment_%s_intactId' % element.attrib['type'] 254 append_to_annotations(ann_key, interact_element.attrib['intactId']) 255 elif element.attrib['type'] == 'alternative products': 256 for alt_element in element.getiterator(NS + 'isoform'): 257 ann_key = 'comment_%s_isoform' % element.attrib['type'].replace(' ', '') 258 for id_element in alt_element.getiterator(NS + 'id'): 259 append_to_annotations(ann_key, id_element.text) 260 elif element.attrib['type'] == 'mass spectrometry': 261 ann_key = 'comment_%s' % element.attrib['type'].replace(' ', '') 262 start = end = 0 263 for loc_element in element.getiterator(NS + 'location'): 264 pos_els = loc_element.getiterator(NS + 'position') 265 pos_els = list(pos_els) 266 # this try should be avoided, maybe it is safer to skip position parsing for mass spectrometry 267 try: 268 if pos_els: 269 end = int(pos_els[0].attrib['position']) 270 start = end - 1 271 else: 272 start = int(list(loc_element.getiterator(NS + 'begin'))[0].attrib['position']) - 1 273 end = int(list(loc_element.getiterator(NS + 'end'))[0].attrib['position']) 274 except: # undefined positions or erroneously mapped 275 pass 276 mass = element.attrib['mass'] 277 method = element.attrib['method'] 278 if start == end == 0: 279 append_to_annotations(ann_key, 'undefined:%s|%s' % (mass, method)) 280 else: 281 append_to_annotations(ann_key, '%s..%s:%s|%s' % (start, end, mass, method)) 282 elif element.attrib['type'] == 'sequence caution': 283 pass # not parsed: few information, complex structure 284 elif element.attrib['type'] == 'online information': 285 for link_element in element.getiterator(NS + 'link'): 286 ann_key = 'comment_%s' % element.attrib['type'].replace(' ', '') 287 for id_element in link_element.getiterator(NS + 'link'): 288 append_to_annotations(ann_key, 289 '%s@%s' % (element.attrib['name'], link_element.attrib['uri'])) 290 291 #return raw XML comments if needed 292 if self.return_raw_comments: 293 ann_key = 'comment_%s_xml' % element.attrib['type'].replace(' ', '') 294 append_to_annotations(ann_key, ElementTree.tostring(element)) 295 296 def _parse_dbReference(element): 297 self.ParsedSeqRecord.dbxrefs.append(element.attrib['type'] + ':' + element.attrib['id']) 298 #e.g. 299 # <dbReference type="PDB" key="11" id="2GEZ"> 300 # <property value="X-ray" type="method"/> 301 # <property value="2.60 A" type="resolution"/> 302 # <property value="A/C/E/G=1-192, B/D/F/H=193-325" type="chains"/> 303 # </dbReference> 304 if 'type' in element.attrib: 305 if element.attrib['type'] == 'PDB': 306 method = "" 307 resolution = "" 308 for ref_element in element: 309 if ref_element.tag == NS + 'property': 310 dat_type = ref_element.attrib['type'] 311 if dat_type == 'method': 312 method = ref_element.attrib['value'] 313 if dat_type == 'resolution': 314 resolution = ref_element.attrib['value'] 315 if dat_type == 'chains': 316 pairs = ref_element.attrib['value'].split(',') 317 for elem in pairs: 318 pair = elem.strip().split('=') 319 if pair[1] != '-': 320 #TODO - How best to store these, do SeqFeatures make sense? 321 feature = SeqFeature.SeqFeature() 322 feature.type = element.attrib['type'] 323 feature.qualifiers['name'] = element.attrib['id'] 324 feature.qualifiers['method'] = method 325 feature.qualifiers['resolution'] = resolution 326 feature.qualifiers['chains'] = pair[0].split('/') 327 start = int(pair[1].split('-')[0]) - 1 328 end = int(pair[1].split('-')[1]) 329 feature.location = SeqFeature.FeatureLocation(start, end) 330 #self.ParsedSeqRecord.features.append(feature) 331 332 for ref_element in element: 333 if ref_element.tag == NS + 'property': 334 pass # this data cannot be fitted in a seqrecord object with a simple list. however at least ensembl and EMBL parsing can be improved to add entries in dbxrefs 335 336 def _parse_reference(element): 337 reference = SeqFeature.Reference() 338 authors = [] 339 scopes = [] 340 tissues = [] 341 journal_name = '' 342 pub_type = '' 343 pub_date = '' 344 for ref_element in element: 345 if ref_element.tag == NS + 'citation': 346 pub_type = ref_element.attrib['type'] 347 if pub_type == 'submission': 348 pub_type += ' to the ' + ref_element.attrib['db'] 349 if 'name' in ref_element.attrib: 350 journal_name = ref_element.attrib['name'] 351 pub_date = ref_element.attrib.get('date', '') 352 j_volume = ref_element.attrib.get('volume', '') 353 j_first = ref_element.attrib.get('first', '') 354 j_last = ref_element.attrib.get('last', '') 355 for cit_element in ref_element: 356 if cit_element.tag == NS + 'title': 357 reference.title = cit_element.text 358 elif cit_element.tag == NS + 'authorList': 359 for person_element in cit_element: 360 authors.append(person_element.attrib['name']) 361 elif cit_element.tag == NS + 'dbReference': 362 self.ParsedSeqRecord.dbxrefs.append(cit_element.attrib['type'] 363 + ':' + cit_element.attrib['id']) 364 if cit_element.attrib['type'] == 'PubMed': 365 reference.pubmed_id = cit_element.attrib['id'] 366 elif ref_element.attrib['type'] == 'MEDLINE': 367 reference.medline_id = cit_element.attrib['id'] 368 elif ref_element.tag == NS + 'scope': 369 scopes.append(ref_element.text) 370 elif ref_element.tag == NS + 'source': 371 for source_element in ref_element: 372 if source_element.tag == NS + 'tissue': 373 tissues.append(source_element.text) 374 if scopes: 375 scopes_str = 'Scope: ' + ', '.join(scopes) 376 else: 377 scopes_str = '' 378 if tissues: 379 tissues_str = 'Tissue: ' + ', '.join(tissues) 380 else: 381 tissues_str = '' 382 383 # locations cannot be parsed since they are actually written in 384 # free text inside scopes so all the references are put in the 385 # annotation. 386 reference.location = [] 387 reference.authors = ', '.join(authors) 388 if journal_name: 389 if pub_date and j_volume and j_first and j_last: 390 reference.journal = REFERENCE_JOURNAL % dict(name=journal_name, 391 volume=j_volume, first=j_first, last=j_last, pub_date=pub_date) 392 else: 393 reference.journal = journal_name 394 reference.comment = ' | '.join((pub_type, pub_date, scopes_str, tissues_str)) 395 append_to_annotations('references', reference) 396 397 def _parse_position(element, offset=0): 398 try: 399 position = int(element.attrib['position']) + offset 400 except KeyError as err: 401 position = None 402 status = element.attrib.get('status', '') 403 if status == 'unknown': 404 assert position is None 405 return SeqFeature.UnknownPosition() 406 elif not status: 407 return SeqFeature.ExactPosition(position) 408 elif status == 'greater than': 409 return SeqFeature.AfterPosition(position) 410 elif status == 'less than': 411 return SeqFeature.BeforePosition(position) 412 elif status == 'uncertain': 413 return SeqFeature.UncertainPosition(position) 414 else: 415 raise NotImplementedError("Position status %r" % status) 416 417 def _parse_feature(element): 418 feature = SeqFeature.SeqFeature() 419 for k, v in element.attrib.items(): 420 feature.qualifiers[k] = v 421 feature.type = element.attrib.get('type', '') 422 if 'id' in element.attrib: 423 feature.id = element.attrib['id'] 424 for feature_element in element: 425 if feature_element.tag == NS + 'location': 426 position_elements = feature_element.findall(NS + 'position') 427 if position_elements: 428 element = position_elements[0] 429 start_position = _parse_position(element, -1) 430 end_position = _parse_position(element) 431 else: 432 element = feature_element.findall(NS + 'begin')[0] 433 start_position = _parse_position(element, -1) 434 element = feature_element.findall(NS + 'end')[0] 435 end_position = _parse_position(element) 436 feature.location = SeqFeature.FeatureLocation(start_position, end_position) 437 else: 438 try: 439 feature.qualifiers[feature_element.tag.replace(NS, '')] = feature_element.text 440 except: 441 pass # skip unparsable tag 442 self.ParsedSeqRecord.features.append(feature) 443 444 def _parse_proteinExistence(element): 445 append_to_annotations('proteinExistence', element.attrib['type']) 446 447 def _parse_evidence(element): 448 for k, v in element.attrib.items(): 449 ann_key = k 450 append_to_annotations(ann_key, v) 451 452 def _parse_sequence(element): 453 for k, v in element.attrib.items(): 454 if k in ("length", "mass", "version"): 455 self.ParsedSeqRecord.annotations['sequence_%s' % k] = int(v) 456 else: 457 self.ParsedSeqRecord.annotations['sequence_%s' % k] = v 458 seq = ''.join((element.text.split())) 459 self.ParsedSeqRecord.seq = Seq.Seq(seq, self.alphabet) 460 461 #============================================# 462 #Initialize SeqRecord 463 self.ParsedSeqRecord = SeqRecord('', id='') 464 465 #Entry attribs parsing 466 #Unknown dataset should not happen! 467 self.dbname = self.entry.attrib.get('dataset', 'UnknownDataset') 468 #add attribs to annotations 469 for k, v in self.entry.attrib.items(): 470 if k in ("version"): 471 #original 472 #self.ParsedSeqRecord.annotations["entry_%s" % k] = int(v) 473 #To cope with swissProt plain text parser. this can cause errors 474 #if the attrib has the same name of an other annotation 475 self.ParsedSeqRecord.annotations[k] = int(v) 476 else: 477 #self.ParsedSeqRecord.annotations["entry_%s" % k] = v 478 self.ParsedSeqRecord.annotations[k] = v # to cope with swissProt plain text parser 479 480 #Top-to-bottom entry children parsing 481 for element in self.entry: 482 if element.tag == NS + 'name': 483 _parse_name(element) 484 elif element.tag == NS + 'accession': 485 _parse_accession(element) 486 elif element.tag == NS + 'protein': 487 _parse_protein(element) 488 elif element.tag == NS + 'gene': 489 _parse_gene(element) 490 elif element.tag == NS + 'geneLocation': 491 _parse_geneLocation(element) 492 elif element.tag == NS + 'organism': 493 _parse_organism(element) 494 elif element.tag == NS + 'organismHost': 495 _parse_organismHost(element) 496 elif element.tag == NS + 'keyword': 497 _parse_keyword(element) 498 elif element.tag == NS + 'comment': 499 _parse_comment(element) 500 elif element.tag == NS + 'dbReference': 501 _parse_dbReference(element) 502 elif element.tag == NS + 'reference': 503 _parse_reference(element) 504 elif element.tag == NS + 'feature': 505 _parse_feature(element) 506 elif element.tag == NS + 'proteinExistence': 507 _parse_proteinExistence(element) 508 elif element.tag == NS + 'evidence': 509 _parse_evidence(element) 510 elif element.tag == NS + 'sequence': 511 _parse_sequence(element) 512 else: 513 pass 514 515 #remove duplicate dbxrefs 516 self.ParsedSeqRecord.dbxrefs = sorted(list(set(self.ParsedSeqRecord.dbxrefs))) 517 518 # use first accession as id 519 if not self.ParsedSeqRecord.id: 520 self.ParsedSeqRecord.id = self.ParsedSeqRecord.annotations['accessions'][0] 521 522 return self.ParsedSeqRecord 523