Package Bio :: Package SeqIO :: Module UniprotIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.UniprotIO

  1  # Copyright 2010 by Andrea Pierleoni 
  2  # Revisions copyright 2010, 2016 by Peter Cock 
  3  # All rights reserved. 
  4  # 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8   
  9  """Bio.SeqIO support for the "uniprot-xml" file format. 
 10   
 11  See Also: 
 12  http://www.uniprot.org 
 13   
 14  The UniProt XML format essentially replaces the old plain text file format 
 15  originally introduced by SwissProt ("swiss" format in Bio.SeqIO). 
 16   
 17  """ 
 18  import sys 
 19   
 20  from Bio import Seq 
 21  from Bio import SeqFeature 
 22  from Bio import Alphabet 
 23  from Bio.SeqRecord import SeqRecord 
 24  from Bio._py3k import StringIO 
 25   
 26   
 27  # For speed try to use cElementTree rather than ElementTree 
 28  try: 
 29      if (3, 0) <= sys.version_info[:2] <= (3, 1): 
 30          # Workaround for bug in python 3.0 and 3.1, 
 31          # see http://bugs.python.org/issue9257 
 32          from xml.etree import ElementTree as ElementTree 
 33      else: 
 34          from xml.etree import cElementTree as ElementTree 
 35  except ImportError: 
 36      from xml.etree import ElementTree as ElementTree 
 37   
 38  NS = "{http://uniprot.org/uniprot}" 
 39  REFERENCE_JOURNAL = "%(name)s %(volume)s:%(first)s-%(last)s(%(pub_date)s)" 
 40   
 41   
42 -def UniprotIterator(handle, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
43 """Generator function to parse UniProt XML as SeqRecord objects. 44 45 parses an XML entry at a time from any UniProt XML file 46 returns a SeqRecord for each iteration 47 48 This generator can be used in Bio.SeqIO 49 50 return_raw_comments = True --> comment fields are returned as complete XML to allow further processing 51 skip_parsing_errors = True --> if parsing errors are found, skip to next entry 52 """ 53 if isinstance(alphabet, Alphabet.NucleotideAlphabet): 54 raise ValueError("Wrong alphabet %r" % alphabet) 55 if isinstance(alphabet, Alphabet.Gapped): 56 if isinstance(alphabet.alphabet, Alphabet.NucleotideAlphabet): 57 raise ValueError("Wrong alphabet %r" % alphabet) 58 59 if not hasattr(handle, "read"): 60 if isinstance(handle, str): 61 import warnings 62 from Bio import BiopythonDeprecationWarning 63 warnings.warn("Passing an XML-containing handle is recommended", 64 BiopythonDeprecationWarning) 65 handle = StringIO(handle) 66 else: 67 raise TypeError("Requires an XML-containing handle" 68 " (or XML as a string, but that's deprectaed)") 69 70 if ElementTree is None: 71 from Bio import MissingExternalDependencyError 72 raise MissingExternalDependencyError( 73 "No ElementTree module was found. " 74 "Use Python 2.5+, lxml or elementtree if you " 75 "want to use Bio.SeqIO.UniprotIO.") 76 77 for event, elem in ElementTree.iterparse(handle, events=("start", "end")): 78 if event == "end" and elem.tag == NS + "entry": 79 yield Parser(elem, alphabet=alphabet, return_raw_comments=return_raw_comments).parse() 80 elem.clear()
81 82
83 -class Parser(object):
84 """Parse a UniProt XML entry to a SeqRecord. 85 86 return_raw_comments=True to get back the complete comment field in XML format 87 alphabet=Alphabet.ProteinAlphabet() can be modified if needed, default is protein alphabet. 88 """ 89
90 - def __init__(self, elem, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
91 self.entry = elem 92 self.alphabet = alphabet 93 self.return_raw_comments = return_raw_comments
94
95 - def parse(self):
96 """Parse the input.""" 97 assert self.entry.tag == NS + 'entry' 98 99 def append_to_annotations(key, value): 100 if key not in self.ParsedSeqRecord.annotations: 101 self.ParsedSeqRecord.annotations[key] = [] 102 if value not in self.ParsedSeqRecord.annotations[key]: 103 self.ParsedSeqRecord.annotations[key].append(value)
104 105 def _parse_name(element): 106 self.ParsedSeqRecord.name = element.text 107 self.ParsedSeqRecord.dbxrefs.append(self.dbname + ':' + element.text)
108 109 def _parse_accession(element): 110 append_to_annotations('accessions', element.text) # to cope with SwissProt plain text parser 111 self.ParsedSeqRecord.dbxrefs.append(self.dbname + ':' + element.text) 112 113 def _parse_protein(element): 114 """Parse protein names (PRIVATE).""" 115 descr_set = False 116 for protein_element in element: 117 if protein_element.tag in [NS + 'recommendedName', NS + 'submittedName', NS + 'alternativeName']: # recommendedName tag are parsed before 118 # use protein fields for name and description 119 for rec_name in protein_element: 120 ann_key = '%s_%s' % (protein_element.tag.replace(NS, ''), 121 rec_name.tag.replace(NS, '')) 122 append_to_annotations(ann_key, rec_name.text) 123 if (rec_name.tag == NS + 'fullName') and not descr_set: 124 self.ParsedSeqRecord.description = rec_name.text 125 descr_set = True 126 elif protein_element.tag == NS + 'component': 127 pass # not parsed 128 elif protein_element.tag == NS + 'domain': 129 pass # not parsed 130 131 def _parse_gene(element): 132 for genename_element in element: 133 if 'type' in genename_element.attrib: 134 ann_key = 'gene_%s_%s' % (genename_element.tag.replace(NS, ''), 135 genename_element.attrib['type']) 136 if genename_element.attrib['type'] == 'primary': 137 self.ParsedSeqRecord.annotations[ann_key] = genename_element.text 138 else: 139 append_to_annotations(ann_key, genename_element.text) 140 141 def _parse_geneLocation(element): 142 append_to_annotations('geneLocation', element.attrib['type']) 143 144 def _parse_organism(element): 145 organism_name = com_name = sci_name = '' 146 for organism_element in element: 147 if organism_element.tag == NS + 'name': 148 if organism_element.text: 149 if organism_element.attrib['type'] == 'scientific': 150 sci_name = organism_element.text 151 elif organism_element.attrib['type'] == 'common': 152 com_name = organism_element.text 153 else: 154 # e.g. synonym 155 append_to_annotations("organism_name", organism_element.text) 156 elif organism_element.tag == NS + 'dbReference': 157 self.ParsedSeqRecord.dbxrefs.append(organism_element.attrib['type'] + ':' + organism_element.attrib['id']) 158 elif organism_element.tag == NS + 'lineage': 159 for taxon_element in organism_element: 160 if taxon_element.tag == NS + 'taxon': 161 append_to_annotations('taxonomy', taxon_element.text) 162 if sci_name and com_name: 163 organism_name = '%s (%s)' % (sci_name, com_name) 164 elif sci_name: 165 organism_name = sci_name 166 elif com_name: 167 organism_name = com_name 168 self.ParsedSeqRecord.annotations['organism'] = organism_name 169 170 def _parse_organismHost(element): 171 for organism_element in element: 172 if organism_element.tag == NS + 'name': 173 append_to_annotations("organism_host", organism_element.text) 174 175 def _parse_keyword(element): 176 append_to_annotations('keywords', element.text) 177 178 def _parse_comment(element): 179 """Parse comments (PRIVATE). 180 181 Comment fields are very heterogeneus. each type has his own (frequently mutated) schema. 182 To store all the contained data, more complex data structures are needed, such as 183 annotated dictionaries. This is left to end user, by optionally setting: 184 185 return_raw_comments=True 186 187 The original XML is returned in the annotation fields. 188 189 Available comment types at december 2009: 190 - "allergen" 191 - "alternative products" 192 - "biotechnology" 193 - "biophysicochemical properties" 194 - "catalytic activity" 195 - "caution" 196 - "cofactor" 197 - "developmental stage" 198 - "disease" 199 - "domain" 200 - "disruption phenotype" 201 - "enzyme regulation" 202 - "function" 203 - "induction" 204 - "miscellaneous" 205 - "pathway" 206 - "pharmaceutical" 207 - "polymorphism" 208 - "PTM" 209 - "RNA editing" 210 - "similarity" 211 - "subcellular location" 212 - "sequence caution" 213 - "subunit" 214 - "tissue specificity" 215 - "toxic dose" 216 - "online information" 217 - "mass spectrometry" 218 - "interaction" 219 220 """ 221 simple_comments = ["allergen", 222 "biotechnology", 223 "biophysicochemical properties", 224 "catalytic activity", 225 "caution", 226 "cofactor", 227 "developmental stage", 228 "disease", 229 "domain", 230 "disruption phenotype", 231 "enzyme regulation", 232 "function", 233 "induction", 234 "miscellaneous", 235 "pathway", 236 "pharmaceutical", 237 "polymorphism", 238 "PTM", 239 "RNA editing", # positions not parsed 240 "similarity", 241 "subunit", 242 "tissue specificity", 243 "toxic dose", 244 ] 245 246 if element.attrib['type'] in simple_comments: 247 ann_key = 'comment_%s' % element.attrib['type'].replace(' ', '') 248 for text_element in element.getiterator(NS + 'text'): 249 if text_element.text: 250 append_to_annotations(ann_key, text_element.text) 251 elif element.attrib['type'] == 'subcellular location': 252 for subloc_element in element.getiterator(NS + 'subcellularLocation'): 253 for el in subloc_element: 254 if el.text: 255 ann_key = 'comment_%s_%s' % (element.attrib['type'].replace(' ', ''), el.tag.replace(NS, '')) 256 append_to_annotations(ann_key, el.text) 257 elif element.attrib['type'] == 'interaction': 258 for interact_element in element.getiterator(NS + 'interactant'): 259 ann_key = 'comment_%s_intactId' % element.attrib['type'] 260 append_to_annotations(ann_key, interact_element.attrib['intactId']) 261 elif element.attrib['type'] == 'alternative products': 262 for alt_element in element.getiterator(NS + 'isoform'): 263 ann_key = 'comment_%s_isoform' % element.attrib['type'].replace(' ', '') 264 for id_element in alt_element.getiterator(NS + 'id'): 265 append_to_annotations(ann_key, id_element.text) 266 elif element.attrib['type'] == 'mass spectrometry': 267 ann_key = 'comment_%s' % element.attrib['type'].replace(' ', '') 268 start = end = 0 269 for loc_element in element.getiterator(NS + 'location'): 270 pos_els = loc_element.getiterator(NS + 'position') 271 pos_els = list(pos_els) 272 # this try should be avoided, maybe it is safer to skip position parsing for mass spectrometry 273 try: 274 if pos_els: 275 end = int(pos_els[0].attrib['position']) 276 start = end - 1 277 else: 278 start = int(list(loc_element.getiterator(NS + 'begin'))[0].attrib['position']) - 1 279 end = int(list(loc_element.getiterator(NS + 'end'))[0].attrib['position']) 280 except (ValueError, KeyError): # undefined positions or erroneously mapped 281 pass 282 mass = element.attrib['mass'] 283 method = element.attrib['method'] 284 if start == end == 0: 285 append_to_annotations(ann_key, 'undefined:%s|%s' % (mass, method)) 286 else: 287 append_to_annotations(ann_key, '%s..%s:%s|%s' % (start, end, mass, method)) 288 elif element.attrib['type'] == 'sequence caution': 289 pass # not parsed: few information, complex structure 290 elif element.attrib['type'] == 'online information': 291 for link_element in element.getiterator(NS + 'link'): 292 ann_key = 'comment_%s' % element.attrib['type'].replace(' ', '') 293 for id_element in link_element.getiterator(NS + 'link'): 294 append_to_annotations(ann_key, 295 '%s@%s' % (element.attrib['name'], link_element.attrib['uri'])) 296 297 # return raw XML comments if needed 298 if self.return_raw_comments: 299 ann_key = 'comment_%s_xml' % element.attrib['type'].replace(' ', '') 300 append_to_annotations(ann_key, ElementTree.tostring(element)) 301 302 def _parse_dbReference(element): 303 self.ParsedSeqRecord.dbxrefs.append(element.attrib['type'] + ':' + element.attrib['id']) 304 # e.g. 305 # <dbReference type="PDB" key="11" id="2GEZ"> 306 # <property value="X-ray" type="method"/> 307 # <property value="2.60 A" type="resolution"/> 308 # <property value="A/C/E/G=1-192, B/D/F/H=193-325" type="chains"/> 309 # </dbReference> 310 if 'type' in element.attrib: 311 if element.attrib['type'] == 'PDB': 312 method = "" 313 resolution = "" 314 for ref_element in element: 315 if ref_element.tag == NS + 'property': 316 dat_type = ref_element.attrib['type'] 317 if dat_type == 'method': 318 method = ref_element.attrib['value'] 319 if dat_type == 'resolution': 320 resolution = ref_element.attrib['value'] 321 if dat_type == 'chains': 322 pairs = ref_element.attrib['value'].split(',') 323 for elem in pairs: 324 pair = elem.strip().split('=') 325 if pair[1] != '-': 326 # TODO - How best to store these, do SeqFeatures make sense? 327 feature = SeqFeature.SeqFeature() 328 feature.type = element.attrib['type'] 329 feature.qualifiers['name'] = element.attrib['id'] 330 feature.qualifiers['method'] = method 331 feature.qualifiers['resolution'] = resolution 332 feature.qualifiers['chains'] = pair[0].split('/') 333 start = int(pair[1].split('-')[0]) - 1 334 end = int(pair[1].split('-')[1]) 335 feature.location = SeqFeature.FeatureLocation(start, end) 336 # self.ParsedSeqRecord.features.append(feature) 337 338 for ref_element in element: 339 if ref_element.tag == NS + 'property': 340 pass # this data cannot be fitted in a seqrecord object with a simple list. however at least ensembl and EMBL parsing can be improved to add entries in dbxrefs 341 342 def _parse_reference(element): 343 reference = SeqFeature.Reference() 344 authors = [] 345 scopes = [] 346 tissues = [] 347 journal_name = '' 348 pub_type = '' 349 pub_date = '' 350 for ref_element in element: 351 if ref_element.tag == NS + 'citation': 352 pub_type = ref_element.attrib['type'] 353 if pub_type == 'submission': 354 pub_type += ' to the ' + ref_element.attrib['db'] 355 if 'name' in ref_element.attrib: 356 journal_name = ref_element.attrib['name'] 357 pub_date = ref_element.attrib.get('date', '') 358 j_volume = ref_element.attrib.get('volume', '') 359 j_first = ref_element.attrib.get('first', '') 360 j_last = ref_element.attrib.get('last', '') 361 for cit_element in ref_element: 362 if cit_element.tag == NS + 'title': 363 reference.title = cit_element.text 364 elif cit_element.tag == NS + 'authorList': 365 for person_element in cit_element: 366 authors.append(person_element.attrib['name']) 367 elif cit_element.tag == NS + 'dbReference': 368 self.ParsedSeqRecord.dbxrefs.append(cit_element.attrib['type'] + 369 ':' + cit_element.attrib['id']) 370 if cit_element.attrib['type'] == 'PubMed': 371 reference.pubmed_id = cit_element.attrib['id'] 372 elif ref_element.attrib['type'] == 'MEDLINE': 373 reference.medline_id = cit_element.attrib['id'] 374 elif ref_element.tag == NS + 'scope': 375 scopes.append(ref_element.text) 376 elif ref_element.tag == NS + 'source': 377 for source_element in ref_element: 378 if source_element.tag == NS + 'tissue': 379 tissues.append(source_element.text) 380 if scopes: 381 scopes_str = 'Scope: ' + ', '.join(scopes) 382 else: 383 scopes_str = '' 384 if tissues: 385 tissues_str = 'Tissue: ' + ', '.join(tissues) 386 else: 387 tissues_str = '' 388 389 # locations cannot be parsed since they are actually written in 390 # free text inside scopes so all the references are put in the 391 # annotation. 392 reference.location = [] 393 reference.authors = ', '.join(authors) 394 if journal_name: 395 if pub_date and j_volume and j_first and j_last: 396 reference.journal = REFERENCE_JOURNAL % dict(name=journal_name, 397 volume=j_volume, first=j_first, last=j_last, pub_date=pub_date) 398 else: 399 reference.journal = journal_name 400 reference.comment = ' | '.join((pub_type, pub_date, scopes_str, tissues_str)) 401 append_to_annotations('references', reference) 402 403 def _parse_position(element, offset=0): 404 try: 405 position = int(element.attrib['position']) + offset 406 except KeyError as err: 407 position = None 408 status = element.attrib.get('status', '') 409 if status == 'unknown': 410 assert position is None 411 return SeqFeature.UnknownPosition() 412 elif not status: 413 return SeqFeature.ExactPosition(position) 414 elif status == 'greater than': 415 return SeqFeature.AfterPosition(position) 416 elif status == 'less than': 417 return SeqFeature.BeforePosition(position) 418 elif status == 'uncertain': 419 return SeqFeature.UncertainPosition(position) 420 else: 421 raise NotImplementedError("Position status %r" % status) 422 423 def _parse_feature(element): 424 feature = SeqFeature.SeqFeature() 425 for k, v in element.attrib.items(): 426 feature.qualifiers[k] = v 427 feature.type = element.attrib.get('type', '') 428 if 'id' in element.attrib: 429 feature.id = element.attrib['id'] 430 for feature_element in element: 431 if feature_element.tag == NS + 'location': 432 position_elements = feature_element.findall(NS + 'position') 433 if position_elements: 434 element = position_elements[0] 435 start_position = _parse_position(element, -1) 436 end_position = _parse_position(element) 437 else: 438 element = feature_element.findall(NS + 'begin')[0] 439 start_position = _parse_position(element, -1) 440 element = feature_element.findall(NS + 'end')[0] 441 end_position = _parse_position(element) 442 feature.location = SeqFeature.FeatureLocation(start_position, end_position) 443 else: 444 try: 445 feature.qualifiers[feature_element.tag.replace(NS, '')] = feature_element.text 446 except Exception: # TODO - Which exceptions? 447 pass # skip unparsable tag 448 self.ParsedSeqRecord.features.append(feature) 449 450 def _parse_proteinExistence(element): 451 append_to_annotations('proteinExistence', element.attrib['type']) 452 453 def _parse_evidence(element): 454 for k, v in element.attrib.items(): 455 ann_key = k 456 append_to_annotations(ann_key, v) 457 458 def _parse_sequence(element): 459 for k, v in element.attrib.items(): 460 if k in ("length", "mass", "version"): 461 self.ParsedSeqRecord.annotations['sequence_%s' % k] = int(v) 462 else: 463 self.ParsedSeqRecord.annotations['sequence_%s' % k] = v 464 seq = ''.join((element.text.split())) 465 self.ParsedSeqRecord.seq = Seq.Seq(seq, self.alphabet) 466 467 # ============================================# 468 # Initialize SeqRecord 469 self.ParsedSeqRecord = SeqRecord('', id='') 470 471 # Entry attribs parsing 472 # Unknown dataset should not happen! 473 self.dbname = self.entry.attrib.get('dataset', 'UnknownDataset') 474 # add attribs to annotations 475 for k, v in self.entry.attrib.items(): 476 if k in ("version"): 477 # original 478 # self.ParsedSeqRecord.annotations["entry_%s" % k] = int(v) 479 # To cope with swissProt plain text parser. this can cause errors 480 # if the attrib has the same name of an other annotation 481 self.ParsedSeqRecord.annotations[k] = int(v) 482 else: 483 # self.ParsedSeqRecord.annotations["entry_%s" % k] = v 484 self.ParsedSeqRecord.annotations[k] = v # to cope with swissProt plain text parser 485 486 # Top-to-bottom entry children parsing 487 for element in self.entry: 488 if element.tag == NS + 'name': 489 _parse_name(element) 490 elif element.tag == NS + 'accession': 491 _parse_accession(element) 492 elif element.tag == NS + 'protein': 493 _parse_protein(element) 494 elif element.tag == NS + 'gene': 495 _parse_gene(element) 496 elif element.tag == NS + 'geneLocation': 497 _parse_geneLocation(element) 498 elif element.tag == NS + 'organism': 499 _parse_organism(element) 500 elif element.tag == NS + 'organismHost': 501 _parse_organismHost(element) 502 elif element.tag == NS + 'keyword': 503 _parse_keyword(element) 504 elif element.tag == NS + 'comment': 505 _parse_comment(element) 506 elif element.tag == NS + 'dbReference': 507 _parse_dbReference(element) 508 elif element.tag == NS + 'reference': 509 _parse_reference(element) 510 elif element.tag == NS + 'feature': 511 _parse_feature(element) 512 elif element.tag == NS + 'proteinExistence': 513 _parse_proteinExistence(element) 514 elif element.tag == NS + 'evidence': 515 _parse_evidence(element) 516 elif element.tag == NS + 'sequence': 517 _parse_sequence(element) 518 else: 519 pass 520 521 # remove duplicate dbxrefs 522 self.ParsedSeqRecord.dbxrefs = sorted(list(set(self.ParsedSeqRecord.dbxrefs))) 523 524 # use first accession as id 525 if not self.ParsedSeqRecord.id: 526 self.ParsedSeqRecord.id = self.ParsedSeqRecord.annotations['accessions'][0] 527 528 return self.ParsedSeqRecord 529