Package Bio :: Package SeqIO :: Module UniprotIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.UniprotIO

  1  # Copyright 2010 by Andrea Pierleoni 
  2  # Revisions copyright 2010, 2016 by Peter Cock 
  3  # All rights reserved. 
  4  # 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8   
  9  """Bio.SeqIO support for the "uniprot-xml" file format. 
 10   
 11  See also: 
 12   
 13  http://www.uniprot.org 
 14   
 15  The UniProt XML format essentially replaces the old plain text file format 
 16  originally introduced by SwissProt ("swiss" format in Bio.SeqIO). 
 17  """ 
 18  import sys 
 19   
 20  from Bio import Seq 
 21  from Bio import SeqFeature 
 22  from Bio import Alphabet 
 23  from Bio.SeqRecord import SeqRecord 
 24  from Bio._py3k import StringIO 
 25   
 26   
 27  # For speed try to use cElementTree rather than ElementTree 
 28  try: 
 29      if (3, 0) <= sys.version_info[:2] <= (3, 1): 
 30          # Workaround for bug in python 3.0 and 3.1, 
 31          # see http://bugs.python.org/issue9257 
 32          from xml.etree import ElementTree as ElementTree 
 33      else: 
 34          from xml.etree import cElementTree as ElementTree 
 35  except ImportError: 
 36      from xml.etree import ElementTree as ElementTree 
 37   
 38  NS = "{http://uniprot.org/uniprot}" 
 39  REFERENCE_JOURNAL = "%(name)s %(volume)s:%(first)s-%(last)s(%(pub_date)s)" 
 40   
 41   
42 -def UniprotIterator(handle, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
43 """Generator function to parse UniProt XML as SeqRecord objects. 44 45 parses an XML entry at a time from any UniProt XML file 46 returns a SeqRecord for each iteration 47 48 This generator can be used in Bio.SeqIO 49 50 return_raw_comments = True --> comment fields are returned as complete XML to allow further processing 51 skip_parsing_errors = True --> if parsing errors are found, skip to next entry 52 """ 53 if isinstance(alphabet, Alphabet.NucleotideAlphabet): 54 raise ValueError("Wrong alphabet %r" % alphabet) 55 if isinstance(alphabet, Alphabet.Gapped): 56 if isinstance(alphabet.alphabet, Alphabet.NucleotideAlphabet): 57 raise ValueError("Wrong alphabet %r" % alphabet) 58 59 if not hasattr(handle, "read"): 60 if isinstance(handle, str): 61 import warnings 62 from Bio import BiopythonDeprecationWarning 63 warnings.warn("Passing an XML-containing handle is recommended", 64 BiopythonDeprecationWarning) 65 handle = StringIO(handle) 66 else: 67 raise TypeError("Requires an XML-containing handle" 68 " (or XML as a string, but that's deprectaed)") 69 70 if ElementTree is None: 71 from Bio import MissingExternalDependencyError 72 raise MissingExternalDependencyError( 73 "No ElementTree module was found. " 74 "Use Python 2.5+, lxml or elementtree if you " 75 "want to use Bio.SeqIO.UniprotIO.") 76 77 for event, elem in ElementTree.iterparse(handle, events=("start", "end")): 78 if event == "end" and elem.tag == NS + "entry": 79 yield Parser(elem, alphabet=alphabet, return_raw_comments=return_raw_comments).parse() 80 elem.clear()
81 82
83 -class Parser(object):
84 """Parse a UniProt XML entry to a SeqRecord. 85 86 return_raw_comments=True to get back the complete comment field in XML format 87 alphabet=Alphabet.ProteinAlphabet() can be modified if needed, default is protein alphabet. 88 """
89 - def __init__(self, elem, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
90 self.entry = elem 91 self.alphabet = alphabet 92 self.return_raw_comments = return_raw_comments
93
94 - def parse(self):
95 """Parse the input.""" 96 assert self.entry.tag == NS + 'entry' 97 98 def append_to_annotations(key, value): 99 if key not in self.ParsedSeqRecord.annotations: 100 self.ParsedSeqRecord.annotations[key] = [] 101 if value not in self.ParsedSeqRecord.annotations[key]: 102 self.ParsedSeqRecord.annotations[key].append(value)
103 104 def _parse_name(element): 105 self.ParsedSeqRecord.name = element.text 106 self.ParsedSeqRecord.dbxrefs.append(self.dbname + ':' + element.text)
107 108 def _parse_accession(element): 109 append_to_annotations('accessions', element.text) # to cope with SwissProt plain text parser 110 self.ParsedSeqRecord.dbxrefs.append(self.dbname + ':' + element.text) 111 112 def _parse_protein(element): 113 """Parse protein names (PRIVATE).""" 114 descr_set = False 115 for protein_element in element: 116 if protein_element.tag in [NS + 'recommendedName', NS + 'submittedName', NS + 'alternativeName']: # recommendedName tag are parsed before 117 # use protein fields for name and description 118 for rec_name in protein_element: 119 ann_key = '%s_%s' % (protein_element.tag.replace(NS, ''), 120 rec_name.tag.replace(NS, '')) 121 append_to_annotations(ann_key, rec_name.text) 122 if (rec_name.tag == NS + 'fullName') and not descr_set: 123 self.ParsedSeqRecord.description = rec_name.text 124 descr_set = True 125 elif protein_element.tag == NS + 'component': 126 pass # not parsed 127 elif protein_element.tag == NS + 'domain': 128 pass # not parsed 129 130 def _parse_gene(element): 131 for genename_element in element: 132 if 'type' in genename_element.attrib: 133 ann_key = 'gene_%s_%s' % (genename_element.tag.replace(NS, ''), 134 genename_element.attrib['type']) 135 if genename_element.attrib['type'] == 'primary': 136 self.ParsedSeqRecord.annotations[ann_key] = genename_element.text 137 else: 138 append_to_annotations(ann_key, genename_element.text) 139 140 def _parse_geneLocation(element): 141 append_to_annotations('geneLocation', element.attrib['type']) 142 143 def _parse_organism(element): 144 organism_name = com_name = sci_name = '' 145 for organism_element in element: 146 if organism_element.tag == NS + 'name': 147 if organism_element.text: 148 if organism_element.attrib['type'] == 'scientific': 149 sci_name = organism_element.text 150 elif organism_element.attrib['type'] == 'common': 151 com_name = organism_element.text 152 else: 153 # e.g. synonym 154 append_to_annotations("organism_name", organism_element.text) 155 elif organism_element.tag == NS + 'dbReference': 156 self.ParsedSeqRecord.dbxrefs.append(organism_element.attrib['type'] + ':' + organism_element.attrib['id']) 157 elif organism_element.tag == NS + 'lineage': 158 for taxon_element in organism_element: 159 if taxon_element.tag == NS + 'taxon': 160 append_to_annotations('taxonomy', taxon_element.text) 161 if sci_name and com_name: 162 organism_name = '%s (%s)' % (sci_name, com_name) 163 elif sci_name: 164 organism_name = sci_name 165 elif com_name: 166 organism_name = com_name 167 self.ParsedSeqRecord.annotations['organism'] = organism_name 168 169 def _parse_organismHost(element): 170 for organism_element in element: 171 if organism_element.tag == NS + 'name': 172 append_to_annotations("organism_host", organism_element.text) 173 174 def _parse_keyword(element): 175 append_to_annotations('keywords', element.text) 176 177 def _parse_comment(element): 178 """Parse comments (PRIVATE). 179 180 Comment fields are very heterogeneus. each type has his own (frequently mutated) schema. 181 To store all the contained data, more complex data structures are needed, such as 182 annotated dictionaries. This is left to end user, by optionally setting: 183 184 return_raw_comments=True 185 186 The original XML is returned in the annotation fields. 187 188 Available comment types at december 2009: 189 "allergen" 190 "alternative products" 191 "biotechnology" 192 "biophysicochemical properties" 193 "catalytic activity" 194 "caution" 195 "cofactor" 196 "developmental stage" 197 "disease" 198 "domain" 199 "disruption phenotype" 200 "enzyme regulation" 201 "function" 202 "induction" 203 "miscellaneous" 204 "pathway" 205 "pharmaceutical" 206 "polymorphism" 207 "PTM" 208 "RNA editing" 209 "similarity" 210 "subcellular location" 211 "sequence caution" 212 "subunit" 213 "tissue specificity" 214 "toxic dose" 215 "online information" 216 "mass spectrometry" 217 "interaction" 218 """ 219 simple_comments = ["allergen", 220 "biotechnology", 221 "biophysicochemical properties", 222 "catalytic activity", 223 "caution", 224 "cofactor", 225 "developmental stage", 226 "disease", 227 "domain", 228 "disruption phenotype", 229 "enzyme regulation", 230 "function", 231 "induction", 232 "miscellaneous", 233 "pathway", 234 "pharmaceutical", 235 "polymorphism", 236 "PTM", 237 "RNA editing", # positions not parsed 238 "similarity", 239 "subunit", 240 "tissue specificity", 241 "toxic dose", 242 ] 243 244 if element.attrib['type'] in simple_comments: 245 ann_key = 'comment_%s' % element.attrib['type'].replace(' ', '') 246 for text_element in element.getiterator(NS + 'text'): 247 if text_element.text: 248 append_to_annotations(ann_key, text_element.text) 249 elif element.attrib['type'] == 'subcellular location': 250 for subloc_element in element.getiterator(NS + 'subcellularLocation'): 251 for el in subloc_element: 252 if el.text: 253 ann_key = 'comment_%s_%s' % (element.attrib['type'].replace(' ', ''), el.tag.replace(NS, '')) 254 append_to_annotations(ann_key, el.text) 255 elif element.attrib['type'] == 'interaction': 256 for interact_element in element.getiterator(NS + 'interactant'): 257 ann_key = 'comment_%s_intactId' % element.attrib['type'] 258 append_to_annotations(ann_key, interact_element.attrib['intactId']) 259 elif element.attrib['type'] == 'alternative products': 260 for alt_element in element.getiterator(NS + 'isoform'): 261 ann_key = 'comment_%s_isoform' % element.attrib['type'].replace(' ', '') 262 for id_element in alt_element.getiterator(NS + 'id'): 263 append_to_annotations(ann_key, id_element.text) 264 elif element.attrib['type'] == 'mass spectrometry': 265 ann_key = 'comment_%s' % element.attrib['type'].replace(' ', '') 266 start = end = 0 267 for loc_element in element.getiterator(NS + 'location'): 268 pos_els = loc_element.getiterator(NS + 'position') 269 pos_els = list(pos_els) 270 # this try should be avoided, maybe it is safer to skip position parsing for mass spectrometry 271 try: 272 if pos_els: 273 end = int(pos_els[0].attrib['position']) 274 start = end - 1 275 else: 276 start = int(list(loc_element.getiterator(NS + 'begin'))[0].attrib['position']) - 1 277 end = int(list(loc_element.getiterator(NS + 'end'))[0].attrib['position']) 278 except (ValueError, KeyError): # undefined positions or erroneously mapped 279 pass 280 mass = element.attrib['mass'] 281 method = element.attrib['method'] 282 if start == end == 0: 283 append_to_annotations(ann_key, 'undefined:%s|%s' % (mass, method)) 284 else: 285 append_to_annotations(ann_key, '%s..%s:%s|%s' % (start, end, mass, method)) 286 elif element.attrib['type'] == 'sequence caution': 287 pass # not parsed: few information, complex structure 288 elif element.attrib['type'] == 'online information': 289 for link_element in element.getiterator(NS + 'link'): 290 ann_key = 'comment_%s' % element.attrib['type'].replace(' ', '') 291 for id_element in link_element.getiterator(NS + 'link'): 292 append_to_annotations(ann_key, 293 '%s@%s' % (element.attrib['name'], link_element.attrib['uri'])) 294 295 # return raw XML comments if needed 296 if self.return_raw_comments: 297 ann_key = 'comment_%s_xml' % element.attrib['type'].replace(' ', '') 298 append_to_annotations(ann_key, ElementTree.tostring(element)) 299 300 def _parse_dbReference(element): 301 self.ParsedSeqRecord.dbxrefs.append(element.attrib['type'] + ':' + element.attrib['id']) 302 # e.g. 303 # <dbReference type="PDB" key="11" id="2GEZ"> 304 # <property value="X-ray" type="method"/> 305 # <property value="2.60 A" type="resolution"/> 306 # <property value="A/C/E/G=1-192, B/D/F/H=193-325" type="chains"/> 307 # </dbReference> 308 if 'type' in element.attrib: 309 if element.attrib['type'] == 'PDB': 310 method = "" 311 resolution = "" 312 for ref_element in element: 313 if ref_element.tag == NS + 'property': 314 dat_type = ref_element.attrib['type'] 315 if dat_type == 'method': 316 method = ref_element.attrib['value'] 317 if dat_type == 'resolution': 318 resolution = ref_element.attrib['value'] 319 if dat_type == 'chains': 320 pairs = ref_element.attrib['value'].split(',') 321 for elem in pairs: 322 pair = elem.strip().split('=') 323 if pair[1] != '-': 324 # TODO - How best to store these, do SeqFeatures make sense? 325 feature = SeqFeature.SeqFeature() 326 feature.type = element.attrib['type'] 327 feature.qualifiers['name'] = element.attrib['id'] 328 feature.qualifiers['method'] = method 329 feature.qualifiers['resolution'] = resolution 330 feature.qualifiers['chains'] = pair[0].split('/') 331 start = int(pair[1].split('-')[0]) - 1 332 end = int(pair[1].split('-')[1]) 333 feature.location = SeqFeature.FeatureLocation(start, end) 334 # self.ParsedSeqRecord.features.append(feature) 335 336 for ref_element in element: 337 if ref_element.tag == NS + 'property': 338 pass # this data cannot be fitted in a seqrecord object with a simple list. however at least ensembl and EMBL parsing can be improved to add entries in dbxrefs 339 340 def _parse_reference(element): 341 reference = SeqFeature.Reference() 342 authors = [] 343 scopes = [] 344 tissues = [] 345 journal_name = '' 346 pub_type = '' 347 pub_date = '' 348 for ref_element in element: 349 if ref_element.tag == NS + 'citation': 350 pub_type = ref_element.attrib['type'] 351 if pub_type == 'submission': 352 pub_type += ' to the ' + ref_element.attrib['db'] 353 if 'name' in ref_element.attrib: 354 journal_name = ref_element.attrib['name'] 355 pub_date = ref_element.attrib.get('date', '') 356 j_volume = ref_element.attrib.get('volume', '') 357 j_first = ref_element.attrib.get('first', '') 358 j_last = ref_element.attrib.get('last', '') 359 for cit_element in ref_element: 360 if cit_element.tag == NS + 'title': 361 reference.title = cit_element.text 362 elif cit_element.tag == NS + 'authorList': 363 for person_element in cit_element: 364 authors.append(person_element.attrib['name']) 365 elif cit_element.tag == NS + 'dbReference': 366 self.ParsedSeqRecord.dbxrefs.append(cit_element.attrib['type'] + 367 ':' + cit_element.attrib['id']) 368 if cit_element.attrib['type'] == 'PubMed': 369 reference.pubmed_id = cit_element.attrib['id'] 370 elif ref_element.attrib['type'] == 'MEDLINE': 371 reference.medline_id = cit_element.attrib['id'] 372 elif ref_element.tag == NS + 'scope': 373 scopes.append(ref_element.text) 374 elif ref_element.tag == NS + 'source': 375 for source_element in ref_element: 376 if source_element.tag == NS + 'tissue': 377 tissues.append(source_element.text) 378 if scopes: 379 scopes_str = 'Scope: ' + ', '.join(scopes) 380 else: 381 scopes_str = '' 382 if tissues: 383 tissues_str = 'Tissue: ' + ', '.join(tissues) 384 else: 385 tissues_str = '' 386 387 # locations cannot be parsed since they are actually written in 388 # free text inside scopes so all the references are put in the 389 # annotation. 390 reference.location = [] 391 reference.authors = ', '.join(authors) 392 if journal_name: 393 if pub_date and j_volume and j_first and j_last: 394 reference.journal = REFERENCE_JOURNAL % dict(name=journal_name, 395 volume=j_volume, first=j_first, last=j_last, pub_date=pub_date) 396 else: 397 reference.journal = journal_name 398 reference.comment = ' | '.join((pub_type, pub_date, scopes_str, tissues_str)) 399 append_to_annotations('references', reference) 400 401 def _parse_position(element, offset=0): 402 try: 403 position = int(element.attrib['position']) + offset 404 except KeyError as err: 405 position = None 406 status = element.attrib.get('status', '') 407 if status == 'unknown': 408 assert position is None 409 return SeqFeature.UnknownPosition() 410 elif not status: 411 return SeqFeature.ExactPosition(position) 412 elif status == 'greater than': 413 return SeqFeature.AfterPosition(position) 414 elif status == 'less than': 415 return SeqFeature.BeforePosition(position) 416 elif status == 'uncertain': 417 return SeqFeature.UncertainPosition(position) 418 else: 419 raise NotImplementedError("Position status %r" % status) 420 421 def _parse_feature(element): 422 feature = SeqFeature.SeqFeature() 423 for k, v in element.attrib.items(): 424 feature.qualifiers[k] = v 425 feature.type = element.attrib.get('type', '') 426 if 'id' in element.attrib: 427 feature.id = element.attrib['id'] 428 for feature_element in element: 429 if feature_element.tag == NS + 'location': 430 position_elements = feature_element.findall(NS + 'position') 431 if position_elements: 432 element = position_elements[0] 433 start_position = _parse_position(element, -1) 434 end_position = _parse_position(element) 435 else: 436 element = feature_element.findall(NS + 'begin')[0] 437 start_position = _parse_position(element, -1) 438 element = feature_element.findall(NS + 'end')[0] 439 end_position = _parse_position(element) 440 feature.location = SeqFeature.FeatureLocation(start_position, end_position) 441 else: 442 try: 443 feature.qualifiers[feature_element.tag.replace(NS, '')] = feature_element.text 444 except Exception: # TODO - Which exceptions? 445 pass # skip unparsable tag 446 self.ParsedSeqRecord.features.append(feature) 447 448 def _parse_proteinExistence(element): 449 append_to_annotations('proteinExistence', element.attrib['type']) 450 451 def _parse_evidence(element): 452 for k, v in element.attrib.items(): 453 ann_key = k 454 append_to_annotations(ann_key, v) 455 456 def _parse_sequence(element): 457 for k, v in element.attrib.items(): 458 if k in ("length", "mass", "version"): 459 self.ParsedSeqRecord.annotations['sequence_%s' % k] = int(v) 460 else: 461 self.ParsedSeqRecord.annotations['sequence_%s' % k] = v 462 seq = ''.join((element.text.split())) 463 self.ParsedSeqRecord.seq = Seq.Seq(seq, self.alphabet) 464 465 # ============================================# 466 # Initialize SeqRecord 467 self.ParsedSeqRecord = SeqRecord('', id='') 468 469 # Entry attribs parsing 470 # Unknown dataset should not happen! 471 self.dbname = self.entry.attrib.get('dataset', 'UnknownDataset') 472 # add attribs to annotations 473 for k, v in self.entry.attrib.items(): 474 if k in ("version"): 475 # original 476 # self.ParsedSeqRecord.annotations["entry_%s" % k] = int(v) 477 # To cope with swissProt plain text parser. this can cause errors 478 # if the attrib has the same name of an other annotation 479 self.ParsedSeqRecord.annotations[k] = int(v) 480 else: 481 # self.ParsedSeqRecord.annotations["entry_%s" % k] = v 482 self.ParsedSeqRecord.annotations[k] = v # to cope with swissProt plain text parser 483 484 # Top-to-bottom entry children parsing 485 for element in self.entry: 486 if element.tag == NS + 'name': 487 _parse_name(element) 488 elif element.tag == NS + 'accession': 489 _parse_accession(element) 490 elif element.tag == NS + 'protein': 491 _parse_protein(element) 492 elif element.tag == NS + 'gene': 493 _parse_gene(element) 494 elif element.tag == NS + 'geneLocation': 495 _parse_geneLocation(element) 496 elif element.tag == NS + 'organism': 497 _parse_organism(element) 498 elif element.tag == NS + 'organismHost': 499 _parse_organismHost(element) 500 elif element.tag == NS + 'keyword': 501 _parse_keyword(element) 502 elif element.tag == NS + 'comment': 503 _parse_comment(element) 504 elif element.tag == NS + 'dbReference': 505 _parse_dbReference(element) 506 elif element.tag == NS + 'reference': 507 _parse_reference(element) 508 elif element.tag == NS + 'feature': 509 _parse_feature(element) 510 elif element.tag == NS + 'proteinExistence': 511 _parse_proteinExistence(element) 512 elif element.tag == NS + 'evidence': 513 _parse_evidence(element) 514 elif element.tag == NS + 'sequence': 515 _parse_sequence(element) 516 else: 517 pass 518 519 # remove duplicate dbxrefs 520 self.ParsedSeqRecord.dbxrefs = sorted(list(set(self.ParsedSeqRecord.dbxrefs))) 521 522 # use first accession as id 523 if not self.ParsedSeqRecord.id: 524 self.ParsedSeqRecord.id = self.ParsedSeqRecord.annotations['accessions'][0] 525 526 return self.ParsedSeqRecord 527