1
2
3
4
5
6
7
8
9 """Bio.SeqIO support for the "uniprot-xml" file format.
10
11 See also:
12
13 http://www.uniprot.org
14
15 The UniProt XML format essentially replaces the old plain text file format
16 originally introduced by SwissProt ("swiss" format in Bio.SeqIO).
17 """
18 import sys
19
20 from Bio import Seq
21 from Bio import SeqFeature
22 from Bio import Alphabet
23 from Bio.SeqRecord import SeqRecord
24 try:
25 from cStringIO import StringIO
26 except ImportError:
27 from StringIO import StringIO
28
29
30 try:
31 if (3, 0) <= sys.version_info[:2] <= (3, 1):
32
33
34 from xml.etree import ElementTree as ElementTree
35 else:
36 from xml.etree import cElementTree as ElementTree
37 except ImportError:
38 from xml.etree import ElementTree as ElementTree
39
40 NS = "{http://uniprot.org/uniprot}"
41 REFERENCE_JOURNAL = "%(name)s %(volume)s:%(first)s-%(last)s(%(pub_date)s)"
42
43
45 """Generator function to parse UniProt XML as SeqRecord objects.
46
47 parses an XML entry at a time from any UniProt XML file
48 returns a SeqRecord for each iteration
49
50 This generator can be used in Bio.SeqIO
51
52 return_raw_comments = True --> comment fields are returned as complete XML to allow further processing
53 skip_parsing_errors = True --> if parsing errors are found, skip to next entry
54 """
55 if isinstance(alphabet, Alphabet.NucleotideAlphabet):
56 raise ValueError("Wrong alphabet %r" % alphabet)
57 if isinstance(alphabet, Alphabet.Gapped):
58 if isinstance(alphabet.alphabet, Alphabet.NucleotideAlphabet):
59 raise ValueError("Wrong alphabet %r" % alphabet)
60
61 if not hasattr(handle, "read"):
62 if isinstance(handle, str):
63 handle = StringIO(handle)
64 else:
65 raise Exception('An XML-containing handler or an XML string must be passed')
66
67 if ElementTree is None:
68 from Bio import MissingExternalDependencyError
69 raise MissingExternalDependencyError(
70 "No ElementTree module was found. "
71 "Use Python 2.5+, lxml or elementtree if you "
72 "want to use Bio.SeqIO.UniprotIO.")
73
74 for event, elem in ElementTree.iterparse(handle, events=("start", "end")):
75 if event == "end" and elem.tag == NS + "entry":
76 yield Parser(elem, alphabet=alphabet, return_raw_comments=return_raw_comments).parse()
77 elem.clear()
78
79
81 """Parse a UniProt XML entry to a SeqRecord.
82
83 return_raw_comments=True to get back the complete comment field in XML format
84 alphabet=Alphabet.ProteinAlphabet() can be modified if needed, default is protein alphabet.
85 """
90
100
101 def _parse_name(element):
102 self.ParsedSeqRecord.name = element.text
103 self.ParsedSeqRecord.dbxrefs.append(self.dbname + ':' + element.text)
104
105 def _parse_accession(element):
106 append_to_annotations('accessions', element.text)
107 self.ParsedSeqRecord.dbxrefs.append(self.dbname + ':' + element.text)
108
109 def _parse_protein(element):
110 """Parse protein names (PRIVATE)."""
111 descr_set = False
112 for protein_element in element:
113 if protein_element.tag in [NS + 'recommendedName', NS + 'alternativeName']:
114
115 for rec_name in protein_element:
116 ann_key = '%s_%s' % (protein_element.tag.replace(NS, ''),
117 rec_name.tag.replace(NS, ''))
118 append_to_annotations(ann_key, rec_name.text)
119 if (rec_name.tag == NS + 'fullName') and not descr_set:
120 self.ParsedSeqRecord.description = rec_name.text
121 descr_set = True
122 elif protein_element.tag == NS + 'component':
123 pass
124 elif protein_element.tag == NS + 'domain':
125 pass
126
127 def _parse_gene(element):
128 for genename_element in element:
129 if 'type' in genename_element.attrib:
130 ann_key = 'gene_%s_%s' % (genename_element.tag.replace(NS, ''),
131 genename_element.attrib['type'])
132 if genename_element.attrib['type'] == 'primary':
133 self.ParsedSeqRecord.annotations[ann_key] = genename_element.text
134 else:
135 append_to_annotations(ann_key, genename_element.text)
136
137 def _parse_geneLocation(element):
138 append_to_annotations('geneLocation', element.attrib['type'])
139
140 def _parse_organism(element):
141 organism_name = com_name = sci_name = ''
142 for organism_element in element:
143 if organism_element.tag == NS + 'name':
144 if organism_element.text:
145 if organism_element.attrib['type'] == 'scientific':
146 sci_name = organism_element.text
147 elif organism_element.attrib['type'] == 'common':
148 com_name = organism_element.text
149 else:
150
151 append_to_annotations("organism_name", organism_element.text)
152 elif organism_element.tag == NS + 'dbReference':
153 self.ParsedSeqRecord.dbxrefs.append(organism_element.attrib['type'] + ':' + organism_element.attrib['id'])
154 elif organism_element.tag == NS + 'lineage':
155 for taxon_element in organism_element:
156 if taxon_element.tag == NS + 'taxon':
157 append_to_annotations('taxonomy', taxon_element.text)
158 if sci_name and com_name:
159 organism_name = '%s (%s)' % (sci_name, com_name)
160 elif sci_name:
161 organism_name = sci_name
162 elif com_name:
163 organism_name = com_name
164 self.ParsedSeqRecord.annotations['organism'] = organism_name
165
166 def _parse_organismHost(element):
167 for organism_element in element:
168 if organism_element.tag == NS + 'name':
169 append_to_annotations("organism_host", organism_element.text)
170
171 def _parse_keyword(element):
172 append_to_annotations('keywords', element.text)
173
174 def _parse_comment(element):
175 """Parse comments (PRIVATE).
176
177 Comment fields are very heterogeneus. each type has his own (frequently mutated) schema.
178 To store all the contained data, more complex data structures are needed, such as
179 annotated dictionaries. This is left to end user, by optionally setting:
180
181 return_raw_comments=True
182
183 The original XML is returned in the annotation fields.
184
185 Available comment types at december 2009:
186 "allergen"
187 "alternative products"
188 "biotechnology"
189 "biophysicochemical properties"
190 "catalytic activity"
191 "caution"
192 "cofactor"
193 "developmental stage"
194 "disease"
195 "domain"
196 "disruption phenotype"
197 "enzyme regulation"
198 "function"
199 "induction"
200 "miscellaneous"
201 "pathway"
202 "pharmaceutical"
203 "polymorphism"
204 "PTM"
205 "RNA editing"
206 "similarity"
207 "subcellular location"
208 "sequence caution"
209 "subunit"
210 "tissue specificity"
211 "toxic dose"
212 "online information"
213 "mass spectrometry"
214 "interaction"
215 """
216
217 simple_comments = ["allergen",
218 "biotechnology",
219 "biophysicochemical properties",
220 "catalytic activity",
221 "caution",
222 "cofactor",
223 "developmental stage",
224 "disease",
225 "domain",
226 "disruption phenotype",
227 "enzyme regulation",
228 "function",
229 "induction",
230 "miscellaneous",
231 "pathway",
232 "pharmaceutical",
233 "polymorphism",
234 "PTM",
235 "RNA editing",
236 "similarity",
237 "subunit",
238 "tissue specificity",
239 "toxic dose",
240 ]
241
242 if element.attrib['type'] in simple_comments:
243 ann_key = 'comment_%s' % element.attrib['type'].replace(' ', '')
244 for text_element in element.getiterator(NS + 'text'):
245 if text_element.text:
246 append_to_annotations(ann_key, text_element.text)
247 elif element.attrib['type'] == 'subcellular location':
248 for subloc_element in element.getiterator(NS + 'subcellularLocation'):
249 for el in subloc_element:
250 if el.text:
251 ann_key = 'comment_%s_%s' % (element.attrib['type'].replace(' ', ''), el.tag.replace(NS, ''))
252 append_to_annotations(ann_key, el.text)
253 elif element.attrib['type'] == 'interaction':
254 for interact_element in element.getiterator(NS + 'interactant'):
255 ann_key = 'comment_%s_intactId' % element.attrib['type']
256 append_to_annotations(ann_key, interact_element.attrib['intactId'])
257 elif element.attrib['type'] == 'alternative products':
258 for alt_element in element.getiterator(NS + 'isoform'):
259 ann_key = 'comment_%s_isoform' % element.attrib['type'].replace(' ', '')
260 for id_element in alt_element.getiterator(NS + 'id'):
261 append_to_annotations(ann_key, id_element.text)
262 elif element.attrib['type'] == 'mass spectrometry':
263 ann_key = 'comment_%s' % element.attrib['type'].replace(' ', '')
264 start = end = 0
265 for loc_element in element.getiterator(NS + 'location'):
266 pos_els = loc_element.getiterator(NS + 'position')
267 pos_els = list(pos_els)
268
269 try:
270 if pos_els:
271 end = int(pos_els[0].attrib['position'])
272 start = end - 1
273 else:
274 start = int(list(loc_element.getiterator(NS + 'begin'))[0].attrib['position']) - 1
275 end = int(list(loc_element.getiterator(NS + 'end'))[0].attrib['position'])
276 except:
277 pass
278 mass = element.attrib['mass']
279 method = element.attrib['method']
280 if start == end == 0:
281 append_to_annotations(ann_key, 'undefined:%s|%s' % (mass, method))
282 else:
283 append_to_annotations(ann_key, '%s..%s:%s|%s' % (start, end, mass, method))
284 elif element.attrib['type'] == 'sequence caution':
285 pass
286 elif element.attrib['type'] == 'online information':
287 for link_element in element.getiterator(NS + 'link'):
288 ann_key = 'comment_%s' % element.attrib['type'].replace(' ', '')
289 for id_element in link_element.getiterator(NS + 'link'):
290 append_to_annotations(ann_key,
291 '%s@%s' % (element.attrib['name'], link_element.attrib['uri']))
292
293
294 if self.return_raw_comments:
295 ann_key = 'comment_%s_xml' % element.attrib['type'].replace(' ', '')
296 append_to_annotations(ann_key, ElementTree.tostring(element))
297
298 def _parse_dbReference(element):
299 self.ParsedSeqRecord.dbxrefs.append(element.attrib['type'] + ':' + element.attrib['id'])
300
301
302
303
304
305
306 if 'type' in element.attrib:
307 if element.attrib['type'] == 'PDB':
308 method = ""
309 resolution = ""
310 for ref_element in element:
311 if ref_element.tag == NS + 'property':
312 dat_type = ref_element.attrib['type']
313 if dat_type == 'method':
314 method = ref_element.attrib['value']
315 if dat_type == 'resolution':
316 resolution = ref_element.attrib['value']
317 if dat_type == 'chains':
318 pairs = ref_element.attrib['value'].split(',')
319 for elem in pairs:
320 pair = elem.strip().split('=')
321 if pair[1] != '-':
322
323 feature = SeqFeature.SeqFeature()
324 feature.type = element.attrib['type']
325 feature.qualifiers['name'] = element.attrib['id']
326 feature.qualifiers['method'] = method
327 feature.qualifiers['resolution'] = resolution
328 feature.qualifiers['chains'] = pair[0].split('/')
329 start = int(pair[1].split('-')[0]) - 1
330 end = int(pair[1].split('-')[1])
331 feature.location = SeqFeature.FeatureLocation(start, end)
332
333
334 for ref_element in element:
335 if ref_element.tag == NS + 'property':
336 pass
337
338 def _parse_reference(element):
339 reference = SeqFeature.Reference()
340 authors = []
341 scopes = []
342 tissues = []
343 journal_name = ''
344 pub_type = ''
345 pub_date = ''
346 for ref_element in element:
347 if ref_element.tag == NS + 'citation':
348 pub_type = ref_element.attrib['type']
349 if pub_type == 'submission':
350 pub_type += ' to the ' + ref_element.attrib['db']
351 if 'name' in ref_element.attrib:
352 journal_name = ref_element.attrib['name']
353 pub_date = ref_element.attrib.get('date', '')
354 j_volume = ref_element.attrib.get('volume', '')
355 j_first = ref_element.attrib.get('first', '')
356 j_last = ref_element.attrib.get('last', '')
357 for cit_element in ref_element:
358 if cit_element.tag == NS + 'title':
359 reference.title = cit_element.text
360 elif cit_element.tag == NS + 'authorList':
361 for person_element in cit_element:
362 authors.append(person_element.attrib['name'])
363 elif cit_element.tag == NS + 'dbReference':
364 self.ParsedSeqRecord.dbxrefs.append(cit_element.attrib['type']
365 + ':' + cit_element.attrib['id'])
366 if cit_element.attrib['type'] == 'PubMed':
367 reference.pubmed_id = cit_element.attrib['id']
368 elif ref_element.attrib['type'] == 'MEDLINE':
369 reference.medline_id = cit_element.attrib['id']
370 elif ref_element.tag == NS + 'scope':
371 scopes.append(ref_element.text)
372 elif ref_element.tag == NS + 'source':
373 for source_element in ref_element:
374 if source_element.tag == NS + 'tissue':
375 tissues.append(source_element.text)
376 if scopes:
377 scopes_str = 'Scope: ' + ', '.join(scopes)
378 else:
379 scopes_str = ''
380 if tissues:
381 tissues_str = 'Tissue: ' + ', '.join(tissues)
382 else:
383 tissues_str = ''
384
385
386
387
388 reference.location = []
389 reference.authors = ', '.join(authors)
390 if journal_name:
391 if pub_date and j_volume and j_first and j_last:
392 reference.journal = REFERENCE_JOURNAL % dict(name=journal_name,
393 volume=j_volume, first=j_first, last=j_last, pub_date=pub_date)
394 else:
395 reference.journal = journal_name
396 reference.comment = ' | '.join((pub_type, pub_date, scopes_str, tissues_str))
397 append_to_annotations('references', reference)
398
399 def _parse_position(element, offset=0):
400 try:
401 position = int(element.attrib['position']) + offset
402 except KeyError, err:
403 position = None
404 status = element.attrib.get('status', '')
405 if status == 'unknown':
406 assert position is None
407 return SeqFeature.UnknownPosition()
408 elif not status:
409 return SeqFeature.ExactPosition(position)
410 elif status == 'greater than':
411 return SeqFeature.AfterPosition(position)
412 elif status == 'less than':
413 return SeqFeature.BeforePosition(position)
414 elif status == 'uncertain':
415 return SeqFeature.UncertainPosition(position)
416 else:
417 raise NotImplementedError("Position status %r" % status)
418
419 def _parse_feature(element):
420 feature = SeqFeature.SeqFeature()
421 for k, v in element.attrib.items():
422 feature.qualifiers[k] = v
423 feature.type = element.attrib.get('type', '')
424 if 'id' in element.attrib:
425 feature.id = element.attrib['id']
426 for feature_element in element:
427 if feature_element.tag == NS + 'location':
428 position_elements = feature_element.findall(NS + 'position')
429 if position_elements:
430 element = position_elements[0]
431 start_position = _parse_position(element, -1)
432 end_position = _parse_position(element)
433 else:
434 element = feature_element.findall(NS + 'begin')[0]
435 start_position = _parse_position(element, -1)
436 element = feature_element.findall(NS + 'end')[0]
437 end_position = _parse_position(element)
438 feature.location = SeqFeature.FeatureLocation(start_position, end_position)
439 else:
440 try:
441 feature.qualifiers[feature_element.tag.replace(NS, '')] = feature_element.text
442 except:
443 pass
444 self.ParsedSeqRecord.features.append(feature)
445
446 def _parse_proteinExistence(element):
447 append_to_annotations('proteinExistence', element.attrib['type'])
448
449 def _parse_evidence(element):
450 for k, v in element.attrib.items():
451 ann_key = k
452 append_to_annotations(ann_key, v)
453
454 def _parse_sequence(element):
455 for k, v in element.attrib.items():
456 if k in ("length", "mass", "version"):
457 self.ParsedSeqRecord.annotations['sequence_%s' % k] = int(v)
458 else:
459 self.ParsedSeqRecord.annotations['sequence_%s' % k] = v
460 seq = ''.join((element.text.split()))
461 self.ParsedSeqRecord.seq = Seq.Seq(seq, self.alphabet)
462
463
464
465 self.ParsedSeqRecord = SeqRecord('', id='')
466
467
468
469 self.dbname = self.entry.attrib.get('dataset', 'UnknownDataset')
470
471 for k, v in self.entry.attrib.items():
472 if k in ("version"):
473
474
475
476
477 self.ParsedSeqRecord.annotations[k] = int(v)
478 else:
479
480 self.ParsedSeqRecord.annotations[k] = v
481
482
483 for element in self.entry:
484 if element.tag == NS + 'name':
485 _parse_name(element)
486 elif element.tag == NS + 'accession':
487 _parse_accession(element)
488 elif element.tag == NS + 'protein':
489 _parse_protein(element)
490 elif element.tag == NS + 'gene':
491 _parse_gene(element)
492 elif element.tag == NS + 'geneLocation':
493 _parse_geneLocation(element)
494 elif element.tag == NS + 'organism':
495 _parse_organism(element)
496 elif element.tag == NS + 'organismHost':
497 _parse_organismHost(element)
498 elif element.tag == NS + 'keyword':
499 _parse_keyword(element)
500 elif element.tag == NS + 'comment':
501 _parse_comment(element)
502 elif element.tag == NS + 'dbReference':
503 _parse_dbReference(element)
504 elif element.tag == NS + 'reference':
505 _parse_reference(element)
506 elif element.tag == NS + 'feature':
507 _parse_feature(element)
508 elif element.tag == NS + 'proteinExistence':
509 _parse_proteinExistence(element)
510 elif element.tag == NS + 'evidence':
511 _parse_evidence(element)
512 elif element.tag == NS + 'sequence':
513 _parse_sequence(element)
514 else:
515 pass
516
517
518 self.ParsedSeqRecord.dbxrefs = sorted(list(set(self.ParsedSeqRecord.dbxrefs)))
519
520
521 if not self.ParsedSeqRecord.id:
522 self.ParsedSeqRecord.id = self.ParsedSeqRecord.annotations['accessions'][0]
523
524 return self.ParsedSeqRecord
525