Package Bio :: Package SeqIO :: Module UniprotIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.UniprotIO

  1  # Copyright 2010 by Andrea Pierleoni 
  2  # Revisions copyright 2010, 2016 by Peter Cock 
  3  # All rights reserved. 
  4  # 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8   
  9  """Bio.SeqIO support for the "uniprot-xml" file format. 
 10   
 11  See Also: 
 12  http://www.uniprot.org 
 13   
 14  The UniProt XML format essentially replaces the old plain text file format 
 15  originally introduced by SwissProt ("swiss" format in Bio.SeqIO). 
 16   
 17  """ 
 18  import sys 
 19   
 20  from Bio import Seq 
 21  from Bio import SeqFeature 
 22  from Bio import Alphabet 
 23  from Bio.SeqRecord import SeqRecord 
 24  from Bio._py3k import StringIO 
 25   
 26   
 27  # For speed try to use cElementTree rather than ElementTree 
 28  try: 
 29      if (3, 0) <= sys.version_info[:2] <= (3, 1): 
 30          # Workaround for bug in python 3.0 and 3.1, 
 31          # see http://bugs.python.org/issue9257 
 32          from xml.etree import ElementTree as ElementTree 
 33      else: 
 34          from xml.etree import cElementTree as ElementTree 
 35  except ImportError: 
 36      from xml.etree import ElementTree as ElementTree 
 37   
 38  NS = "{http://uniprot.org/uniprot}" 
 39  REFERENCE_JOURNAL = "%(name)s %(volume)s:%(first)s-%(last)s(%(pub_date)s)" 
 40   
 41   
42 -def UniprotIterator(handle, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
43 """Iterate over UniProt XML as SeqRecord objects. 44 45 parses an XML entry at a time from any UniProt XML file 46 returns a SeqRecord for each iteration 47 48 This generator can be used in Bio.SeqIO 49 50 return_raw_comments = True --> comment fields are returned as complete XML to allow further processing 51 skip_parsing_errors = True --> if parsing errors are found, skip to next entry 52 """ 53 if isinstance(alphabet, Alphabet.NucleotideAlphabet): 54 raise ValueError("Wrong alphabet %r" % alphabet) 55 if isinstance(alphabet, Alphabet.Gapped): 56 if isinstance(alphabet.alphabet, Alphabet.NucleotideAlphabet): 57 raise ValueError("Wrong alphabet %r" % alphabet) 58 59 if not hasattr(handle, "read"): 60 if isinstance(handle, str): 61 import warnings 62 from Bio import BiopythonDeprecationWarning 63 warnings.warn("Passing an XML-containing handle is recommended", 64 BiopythonDeprecationWarning) 65 handle = StringIO(handle) 66 else: 67 raise TypeError("Requires an XML-containing handle" 68 " (or XML as a string, but that's deprectaed)") 69 70 if ElementTree is None: 71 from Bio import MissingExternalDependencyError 72 raise MissingExternalDependencyError( 73 "No ElementTree module was found. " 74 "Use Python 2.5+, lxml or elementtree if you " 75 "want to use Bio.SeqIO.UniprotIO.") 76 77 for event, elem in ElementTree.iterparse(handle, events=("start", "end")): 78 if event == "end" and elem.tag == NS + "entry": 79 yield Parser(elem, alphabet=alphabet, return_raw_comments=return_raw_comments).parse() 80 elem.clear()
81 82
83 -class Parser(object):
84 """Parse a UniProt XML entry to a SeqRecord. 85 86 return_raw_comments=True to get back the complete comment field in XML format 87 alphabet=Alphabet.ProteinAlphabet() can be modified if needed, default is protein alphabet. 88 """ 89
90 - def __init__(self, elem, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
91 """Initialize the class.""" 92 self.entry = elem 93 self.alphabet = alphabet 94 self.return_raw_comments = return_raw_comments
95
96 - def parse(self):
97 """Parse the input.""" 98 assert self.entry.tag == NS + 'entry' 99 100 def append_to_annotations(key, value): 101 if key not in self.ParsedSeqRecord.annotations: 102 self.ParsedSeqRecord.annotations[key] = [] 103 if value not in self.ParsedSeqRecord.annotations[key]: 104 self.ParsedSeqRecord.annotations[key].append(value)
105 106 def _parse_name(element): 107 self.ParsedSeqRecord.name = element.text 108 self.ParsedSeqRecord.dbxrefs.append(self.dbname + ':' + element.text)
109 110 def _parse_accession(element): 111 append_to_annotations('accessions', element.text) # to cope with SwissProt plain text parser 112 self.ParsedSeqRecord.dbxrefs.append(self.dbname + ':' + element.text) 113 114 def _parse_protein(element): 115 """Parse protein names (PRIVATE).""" 116 descr_set = False 117 for protein_element in element: 118 if protein_element.tag in [NS + 'recommendedName', NS + 'submittedName', NS + 'alternativeName']: # recommendedName tag are parsed before 119 # use protein fields for name and description 120 for rec_name in protein_element: 121 ann_key = '%s_%s' % (protein_element.tag.replace(NS, ''), 122 rec_name.tag.replace(NS, '')) 123 append_to_annotations(ann_key, rec_name.text) 124 if (rec_name.tag == NS + 'fullName') and not descr_set: 125 self.ParsedSeqRecord.description = rec_name.text 126 descr_set = True 127 elif protein_element.tag == NS + 'component': 128 pass # not parsed 129 elif protein_element.tag == NS + 'domain': 130 pass # not parsed 131 132 def _parse_gene(element): 133 for genename_element in element: 134 if 'type' in genename_element.attrib: 135 ann_key = 'gene_%s_%s' % (genename_element.tag.replace(NS, ''), 136 genename_element.attrib['type']) 137 if genename_element.attrib['type'] == 'primary': 138 self.ParsedSeqRecord.annotations[ann_key] = genename_element.text 139 else: 140 append_to_annotations(ann_key, genename_element.text) 141 142 def _parse_geneLocation(element): 143 append_to_annotations('geneLocation', element.attrib['type']) 144 145 def _parse_organism(element): 146 organism_name = com_name = sci_name = '' 147 for organism_element in element: 148 if organism_element.tag == NS + 'name': 149 if organism_element.text: 150 if organism_element.attrib['type'] == 'scientific': 151 sci_name = organism_element.text 152 elif organism_element.attrib['type'] == 'common': 153 com_name = organism_element.text 154 else: 155 # e.g. synonym 156 append_to_annotations("organism_name", organism_element.text) 157 elif organism_element.tag == NS + 'dbReference': 158 self.ParsedSeqRecord.dbxrefs.append(organism_element.attrib['type'] + ':' + organism_element.attrib['id']) 159 elif organism_element.tag == NS + 'lineage': 160 for taxon_element in organism_element: 161 if taxon_element.tag == NS + 'taxon': 162 append_to_annotations('taxonomy', taxon_element.text) 163 if sci_name and com_name: 164 organism_name = '%s (%s)' % (sci_name, com_name) 165 elif sci_name: 166 organism_name = sci_name 167 elif com_name: 168 organism_name = com_name 169 self.ParsedSeqRecord.annotations['organism'] = organism_name 170 171 def _parse_organismHost(element): 172 for organism_element in element: 173 if organism_element.tag == NS + 'name': 174 append_to_annotations("organism_host", organism_element.text) 175 176 def _parse_keyword(element): 177 append_to_annotations('keywords', element.text) 178 179 def _parse_comment(element): 180 """Parse comments (PRIVATE). 181 182 Comment fields are very heterogeneus. each type has his own (frequently mutated) schema. 183 To store all the contained data, more complex data structures are needed, such as 184 annotated dictionaries. This is left to end user, by optionally setting: 185 186 return_raw_comments=True 187 188 The original XML is returned in the annotation fields. 189 190 Available comment types at december 2009: 191 - "allergen" 192 - "alternative products" 193 - "biotechnology" 194 - "biophysicochemical properties" 195 - "catalytic activity" 196 - "caution" 197 - "cofactor" 198 - "developmental stage" 199 - "disease" 200 - "domain" 201 - "disruption phenotype" 202 - "enzyme regulation" 203 - "function" 204 - "induction" 205 - "miscellaneous" 206 - "pathway" 207 - "pharmaceutical" 208 - "polymorphism" 209 - "PTM" 210 - "RNA editing" 211 - "similarity" 212 - "subcellular location" 213 - "sequence caution" 214 - "subunit" 215 - "tissue specificity" 216 - "toxic dose" 217 - "online information" 218 - "mass spectrometry" 219 - "interaction" 220 221 """ 222 simple_comments = ["allergen", 223 "biotechnology", 224 "biophysicochemical properties", 225 "catalytic activity", 226 "caution", 227 "cofactor", 228 "developmental stage", 229 "disease", 230 "domain", 231 "disruption phenotype", 232 "enzyme regulation", 233 "function", 234 "induction", 235 "miscellaneous", 236 "pathway", 237 "pharmaceutical", 238 "polymorphism", 239 "PTM", 240 "RNA editing", # positions not parsed 241 "similarity", 242 "subunit", 243 "tissue specificity", 244 "toxic dose", 245 ] 246 247 if element.attrib['type'] in simple_comments: 248 ann_key = 'comment_%s' % element.attrib['type'].replace(' ', '') 249 for text_element in element.getiterator(NS + 'text'): 250 if text_element.text: 251 append_to_annotations(ann_key, text_element.text) 252 elif element.attrib['type'] == 'subcellular location': 253 for subloc_element in element.getiterator(NS + 'subcellularLocation'): 254 for el in subloc_element: 255 if el.text: 256 ann_key = 'comment_%s_%s' % (element.attrib['type'].replace(' ', ''), el.tag.replace(NS, '')) 257 append_to_annotations(ann_key, el.text) 258 elif element.attrib['type'] == 'interaction': 259 for interact_element in element.getiterator(NS + 'interactant'): 260 ann_key = 'comment_%s_intactId' % element.attrib['type'] 261 append_to_annotations(ann_key, interact_element.attrib['intactId']) 262 elif element.attrib['type'] == 'alternative products': 263 for alt_element in element.getiterator(NS + 'isoform'): 264 ann_key = 'comment_%s_isoform' % element.attrib['type'].replace(' ', '') 265 for id_element in alt_element.getiterator(NS + 'id'): 266 append_to_annotations(ann_key, id_element.text) 267 elif element.attrib['type'] == 'mass spectrometry': 268 ann_key = 'comment_%s' % element.attrib['type'].replace(' ', '') 269 start = end = 0 270 for loc_element in element.getiterator(NS + 'location'): 271 pos_els = loc_element.getiterator(NS + 'position') 272 pos_els = list(pos_els) 273 # this try should be avoided, maybe it is safer to skip position parsing for mass spectrometry 274 try: 275 if pos_els: 276 end = int(pos_els[0].attrib['position']) 277 start = end - 1 278 else: 279 start = int(list(loc_element.getiterator(NS + 'begin'))[0].attrib['position']) - 1 280 end = int(list(loc_element.getiterator(NS + 'end'))[0].attrib['position']) 281 except (ValueError, KeyError): # undefined positions or erroneously mapped 282 pass 283 mass = element.attrib['mass'] 284 method = element.attrib['method'] 285 if start == end == 0: 286 append_to_annotations(ann_key, 'undefined:%s|%s' % (mass, method)) 287 else: 288 append_to_annotations(ann_key, '%s..%s:%s|%s' % (start, end, mass, method)) 289 elif element.attrib['type'] == 'sequence caution': 290 pass # not parsed: few information, complex structure 291 elif element.attrib['type'] == 'online information': 292 for link_element in element.getiterator(NS + 'link'): 293 ann_key = 'comment_%s' % element.attrib['type'].replace(' ', '') 294 for id_element in link_element.getiterator(NS + 'link'): 295 append_to_annotations(ann_key, 296 '%s@%s' % (element.attrib['name'], link_element.attrib['uri'])) 297 298 # return raw XML comments if needed 299 if self.return_raw_comments: 300 ann_key = 'comment_%s_xml' % element.attrib['type'].replace(' ', '') 301 append_to_annotations(ann_key, ElementTree.tostring(element)) 302 303 def _parse_dbReference(element): 304 self.ParsedSeqRecord.dbxrefs.append(element.attrib['type'] + ':' + element.attrib['id']) 305 # e.g. 306 # <dbReference type="PDB" key="11" id="2GEZ"> 307 # <property value="X-ray" type="method"/> 308 # <property value="2.60 A" type="resolution"/> 309 # <property value="A/C/E/G=1-192, B/D/F/H=193-325" type="chains"/> 310 # </dbReference> 311 if 'type' in element.attrib: 312 if element.attrib['type'] == 'PDB': 313 method = "" 314 resolution = "" 315 for ref_element in element: 316 if ref_element.tag == NS + 'property': 317 dat_type = ref_element.attrib['type'] 318 if dat_type == 'method': 319 method = ref_element.attrib['value'] 320 if dat_type == 'resolution': 321 resolution = ref_element.attrib['value'] 322 if dat_type == 'chains': 323 pairs = ref_element.attrib['value'].split(',') 324 for elem in pairs: 325 pair = elem.strip().split('=') 326 if pair[1] != '-': 327 # TODO - How best to store these, do SeqFeatures make sense? 328 feature = SeqFeature.SeqFeature() 329 feature.type = element.attrib['type'] 330 feature.qualifiers['name'] = element.attrib['id'] 331 feature.qualifiers['method'] = method 332 feature.qualifiers['resolution'] = resolution 333 feature.qualifiers['chains'] = pair[0].split('/') 334 start = int(pair[1].split('-')[0]) - 1 335 end = int(pair[1].split('-')[1]) 336 feature.location = SeqFeature.FeatureLocation(start, end) 337 # self.ParsedSeqRecord.features.append(feature) 338 339 for ref_element in element: 340 if ref_element.tag == NS + 'property': 341 pass # this data cannot be fitted in a seqrecord object with a simple list. however at least ensembl and EMBL parsing can be improved to add entries in dbxrefs 342 343 def _parse_reference(element): 344 reference = SeqFeature.Reference() 345 authors = [] 346 scopes = [] 347 tissues = [] 348 journal_name = '' 349 pub_type = '' 350 pub_date = '' 351 for ref_element in element: 352 if ref_element.tag == NS + 'citation': 353 pub_type = ref_element.attrib['type'] 354 if pub_type == 'submission': 355 pub_type += ' to the ' + ref_element.attrib['db'] 356 if 'name' in ref_element.attrib: 357 journal_name = ref_element.attrib['name'] 358 pub_date = ref_element.attrib.get('date', '') 359 j_volume = ref_element.attrib.get('volume', '') 360 j_first = ref_element.attrib.get('first', '') 361 j_last = ref_element.attrib.get('last', '') 362 for cit_element in ref_element: 363 if cit_element.tag == NS + 'title': 364 reference.title = cit_element.text 365 elif cit_element.tag == NS + 'authorList': 366 for person_element in cit_element: 367 authors.append(person_element.attrib['name']) 368 elif cit_element.tag == NS + 'dbReference': 369 self.ParsedSeqRecord.dbxrefs.append(cit_element.attrib['type'] + 370 ':' + cit_element.attrib['id']) 371 if cit_element.attrib['type'] == 'PubMed': 372 reference.pubmed_id = cit_element.attrib['id'] 373 elif ref_element.attrib['type'] == 'MEDLINE': 374 reference.medline_id = cit_element.attrib['id'] 375 elif ref_element.tag == NS + 'scope': 376 scopes.append(ref_element.text) 377 elif ref_element.tag == NS + 'source': 378 for source_element in ref_element: 379 if source_element.tag == NS + 'tissue': 380 tissues.append(source_element.text) 381 if scopes: 382 scopes_str = 'Scope: ' + ', '.join(scopes) 383 else: 384 scopes_str = '' 385 if tissues: 386 tissues_str = 'Tissue: ' + ', '.join(tissues) 387 else: 388 tissues_str = '' 389 390 # locations cannot be parsed since they are actually written in 391 # free text inside scopes so all the references are put in the 392 # annotation. 393 reference.location = [] 394 reference.authors = ', '.join(authors) 395 if journal_name: 396 if pub_date and j_volume and j_first and j_last: 397 reference.journal = REFERENCE_JOURNAL % dict(name=journal_name, 398 volume=j_volume, 399 first=j_first, 400 last=j_last, 401 pub_date=pub_date) 402 else: 403 reference.journal = journal_name 404 reference.comment = ' | '.join((pub_type, pub_date, scopes_str, tissues_str)) 405 append_to_annotations('references', reference) 406 407 def _parse_position(element, offset=0): 408 try: 409 position = int(element.attrib['position']) + offset 410 except KeyError as err: 411 position = None 412 status = element.attrib.get('status', '') 413 if status == 'unknown': 414 assert position is None 415 return SeqFeature.UnknownPosition() 416 elif not status: 417 return SeqFeature.ExactPosition(position) 418 elif status == 'greater than': 419 return SeqFeature.AfterPosition(position) 420 elif status == 'less than': 421 return SeqFeature.BeforePosition(position) 422 elif status == 'uncertain': 423 return SeqFeature.UncertainPosition(position) 424 else: 425 raise NotImplementedError("Position status %r" % status) 426 427 def _parse_feature(element): 428 feature = SeqFeature.SeqFeature() 429 for k, v in element.attrib.items(): 430 feature.qualifiers[k] = v 431 feature.type = element.attrib.get('type', '') 432 if 'id' in element.attrib: 433 feature.id = element.attrib['id'] 434 for feature_element in element: 435 if feature_element.tag == NS + 'location': 436 position_elements = feature_element.findall(NS + 'position') 437 if position_elements: 438 element = position_elements[0] 439 start_position = _parse_position(element, -1) 440 end_position = _parse_position(element) 441 else: 442 element = feature_element.findall(NS + 'begin')[0] 443 start_position = _parse_position(element, -1) 444 element = feature_element.findall(NS + 'end')[0] 445 end_position = _parse_position(element) 446 feature.location = SeqFeature.FeatureLocation(start_position, end_position) 447 else: 448 try: 449 feature.qualifiers[feature_element.tag.replace(NS, '')] = feature_element.text 450 except Exception: # TODO - Which exceptions? 451 pass # skip unparsable tag 452 self.ParsedSeqRecord.features.append(feature) 453 454 def _parse_proteinExistence(element): 455 append_to_annotations('proteinExistence', element.attrib['type']) 456 457 def _parse_evidence(element): 458 for k, v in element.attrib.items(): 459 ann_key = k 460 append_to_annotations(ann_key, v) 461 462 def _parse_sequence(element): 463 for k, v in element.attrib.items(): 464 if k in ("length", "mass", "version"): 465 self.ParsedSeqRecord.annotations['sequence_%s' % k] = int(v) 466 else: 467 self.ParsedSeqRecord.annotations['sequence_%s' % k] = v 468 seq = ''.join((element.text.split())) 469 self.ParsedSeqRecord.seq = Seq.Seq(seq, self.alphabet) 470 471 # ============================================# 472 # Initialize SeqRecord 473 self.ParsedSeqRecord = SeqRecord('', id='') 474 475 # Entry attribs parsing 476 # Unknown dataset should not happen! 477 self.dbname = self.entry.attrib.get('dataset', 'UnknownDataset') 478 # add attribs to annotations 479 for k, v in self.entry.attrib.items(): 480 if k in ("version"): 481 # original 482 # self.ParsedSeqRecord.annotations["entry_%s" % k] = int(v) 483 # To cope with swissProt plain text parser. this can cause errors 484 # if the attrib has the same name of an other annotation 485 self.ParsedSeqRecord.annotations[k] = int(v) 486 else: 487 # self.ParsedSeqRecord.annotations["entry_%s" % k] = v 488 self.ParsedSeqRecord.annotations[k] = v # to cope with swissProt plain text parser 489 490 # Top-to-bottom entry children parsing 491 for element in self.entry: 492 if element.tag == NS + 'name': 493 _parse_name(element) 494 elif element.tag == NS + 'accession': 495 _parse_accession(element) 496 elif element.tag == NS + 'protein': 497 _parse_protein(element) 498 elif element.tag == NS + 'gene': 499 _parse_gene(element) 500 elif element.tag == NS + 'geneLocation': 501 _parse_geneLocation(element) 502 elif element.tag == NS + 'organism': 503 _parse_organism(element) 504 elif element.tag == NS + 'organismHost': 505 _parse_organismHost(element) 506 elif element.tag == NS + 'keyword': 507 _parse_keyword(element) 508 elif element.tag == NS + 'comment': 509 _parse_comment(element) 510 elif element.tag == NS + 'dbReference': 511 _parse_dbReference(element) 512 elif element.tag == NS + 'reference': 513 _parse_reference(element) 514 elif element.tag == NS + 'feature': 515 _parse_feature(element) 516 elif element.tag == NS + 'proteinExistence': 517 _parse_proteinExistence(element) 518 elif element.tag == NS + 'evidence': 519 _parse_evidence(element) 520 elif element.tag == NS + 'sequence': 521 _parse_sequence(element) 522 else: 523 pass 524 525 # remove duplicate dbxrefs 526 self.ParsedSeqRecord.dbxrefs = sorted(list(set(self.ParsedSeqRecord.dbxrefs))) 527 528 # use first accession as id 529 if not self.ParsedSeqRecord.id: 530 self.ParsedSeqRecord.id = self.ParsedSeqRecord.annotations['accessions'][0] 531 532 return self.ParsedSeqRecord 533