1
2
3
4
5
6 """PhyloXML reader/parser, writer, and associated functions.
7
8 Instantiates tree elements from a parsed phyloXML file, and constructs an XML
9 file from a `Bio.Phylo.PhyloXML` object.
10
11 About capitalization:
12
13 - phyloXML means the file format specification
14 - PhyloXML means the Biopython module `Bio.Phylo.PhyloXML` and its classes
15 - Phyloxml means the top-level class used by `PhyloXMLIO.read` (but not
16 `Bio.Phylo.read`!), containing a list of Phylogenies (objects derived from
17 `BaseTree.Tree`)
18 """
19 __docformat__ = "restructuredtext en"
20
21 import sys
22
23 from Bio.Phylo import PhyloXML as PX
24
25
26 try:
27 if (3, 0) <= sys.version_info[:2] <= (3, 1):
28
29
30 from xml.etree import ElementTree as ElementTree
31 else:
32 from xml.etree import cElementTree as ElementTree
33 except ImportError:
34 from xml.etree import ElementTree as ElementTree
35
36
37
38 NAMESPACES = {
39 'phy': 'http://www.phyloxml.org',
40 }
41
42 try:
43 register_namespace = ElementTree.register_namespace
44 except AttributeError:
45 if not hasattr(ElementTree, '_namespace_map'):
46
47 from xml.etree import ElementTree as ET_py
48 ElementTree._namespace_map = ET_py._namespace_map
49
52
53 for prefix, uri in NAMESPACES.iteritems():
54 register_namespace(prefix, uri)
55
56
58 """Exception raised when PhyloXML object construction cannot continue.
59
60 XML syntax errors will be found and raised by the underlying ElementTree
61 module; this exception is for valid XML that breaks the phyloXML
62 specification.
63 """
64 pass
65
66
67
68
69
71 """Parse a phyloXML file or stream and build a tree of Biopython objects.
72
73 The children of the root node are phylogenies and possibly other arbitrary
74 (non-phyloXML) objects.
75
76 :returns: a single `Bio.Phylo.PhyloXML.Phyloxml` object.
77 """
78 return Parser(file).read()
79
80
82 """Iterate over the phylogenetic trees in a phyloXML file.
83
84 This ignores any additional data stored at the top level, but may be more
85 memory-efficient than the `read` function.
86
87 :returns: a generator of `Bio.Phylo.PhyloXML.Phylogeny` objects.
88 """
89 return Parser(file).parse()
90
91
def write(obj, file, encoding='utf-8', indent=True):
    """Serialize one or more trees as a phyloXML file.

    Whatever is passed as ``obj`` is first normalized to a `Phyloxml`
    container; that container is then handed to `Writer` for output.

    :Parameters:
        obj
            an instance of `Phyloxml`, `Phylogeny` or `BaseTree.Tree`, or an
            iterable of either of the latter two. A bare `Clade` is also
            accepted and is wrapped in a tree first.
        file
            either an open handle or a file name.
        encoding
            passed through unchanged to `Writer.write`.
        indent
            passed through unchanged to `Writer.write`.

    :returns: whatever `Writer.write` returns.
    :raises ValueError: if ``obj`` (or an item inside an iterable ``obj``)
        is not one of the supported tree types.
    """
    base = PX.BaseTree

    def fix_single(tree):
        """Coerce a single tree-like object into a `PX.Phylogeny`."""
        # The PhyloXML-specific classes are tested before the generic
        # BaseTree ones (per the module docstring, Phylogeny derives from
        # BaseTree.Tree, so the order of these checks matters).
        if isinstance(tree, PX.Phylogeny):
            return tree
        if isinstance(tree, PX.Clade):
            return tree.to_phylogeny()
        if isinstance(tree, base.Tree):
            return PX.Phylogeny.from_tree(tree)
        if isinstance(tree, base.Clade):
            return PX.Phylogeny.from_tree(base.Tree(root=tree))
        raise ValueError("iterable must contain Tree or Clade types")

    if not isinstance(obj, PX.Phyloxml):
        if isinstance(obj, (base.Tree, base.Clade)):
            # A lone tree or clade becomes a one-phylogeny container.
            obj = fix_single(obj).to_phyloxml()
        elif hasattr(obj, '__iter__'):
            # Conversion of the items stays lazy: a generator is passed on.
            obj = PX.Phyloxml({}, phylogenies=(fix_single(t) for t in obj))
        else:
            raise ValueError("First argument must be a Phyloxml, Phylogeny, "
                             "Tree, or iterable of Trees or Phylogenies.")

    writer = Writer(obj)
    return writer.write(file, encoding=encoding, indent=indent)
126
127
128
129
130
132 """Extract the local tag from a namespaced tag name."""
133 if tag[0] == '{':
134 return tag[tag.index('}')+1:]
135 return tag
136
137
139 """Split a tag into namespace and local tag strings."""
140 try:
141 return tag[1:].split('}', 1)
142 except:
143 return ('', tag)
144
145
147 """Format an XML tag with the given namespace."""
148 return '{%s}%s' % (namespace, tag)
149
150
152 """Find a child node by tag, and pass it through a constructor.
153
154 Returns None if no matching child is found.
155 """
156 child = parent.find(_ns(tag))
157 if child is not None:
158 return construct(child)
159
160
def _get_child_text(parent, tag, construct=unicode):
    """Look up the first child with the given tag and convert its text.

    The tag is qualified with `_ns` before the lookup, and the child's text
    is passed through `construct`.

    Returns None if there is no matching child, or if the child's text is
    empty or None.
    """
    node = parent.find(_ns(tag))
    if node is None or not node.text:
        return None
    return construct(node.text)
169
170
172 """Find child nodes by tag; pass each through a constructor.
173
174 Returns an empty list if no matching child is found.
175 """
176 return [construct(child) for child in
177 parent.findall(_ns(tag))]
178
179
def _get_children_text(parent, tag, construct=unicode):
    """Collect the converted text of every child with the given tag.

    The tag is qualified with `_ns` before the lookup. Children whose text
    is empty or None are skipped; the text of each remaining child is passed
    through `construct`.

    Returns an empty list if no matching child is found.
    """
    texts = []
    for node in parent.findall(_ns(tag)):
        if node.text:
            texts.append(construct(node.text))
    return texts
188
189
191 """Add line breaks and indentation to ElementTree in-place.
192
193 Sources:
194
195 - http://effbot.org/zone/element-lib.htm#prettyprint
196 - http://infix.se/2007/02/06/gentlemen-indent-your-xml
197 """
198 i = "\n" + level*" "
199 if len(elem):
200 if not elem.text or not elem.text.strip():
201 elem.text = i + " "
202 for e in elem:
203 _indent(e, level+1)
204 if not e.tail or not e.tail.strip():
205 e.tail = i + " "
206 if not e.tail or not e.tail.strip():
207 e.tail = i
208 else:
209 if level and (not elem.tail or not elem.tail.strip()):
210 elem.tail = i
211
212
213
214
215
216
218 if text == 'true':
219 return True
220 if text == 'false':
221 return False
222 raise ValueError('String could not be converted to boolean: ' + text)
223
224
226 out = dct.copy()
227 for key in keys:
228 if key in out:
229 out[key] = _str2bool(out[key])
230 return out
231
232
234 if text is not None:
235 try:
236 return int(text)
237 except Exception:
238 return None
239
240
242 if text is not None:
243 try:
244 return float(text)
245 except Exception:
246 return None
247
248
250 """Replace all spans of whitespace with a single space character.
251
252 Also remove leading and trailing whitespace. See "Collapse Whitespace
253 Policy" in the phyloXML spec glossary:
254 http://phyloxml.org/documentation/version_100/phyloxml.xsd.html#Glossary
255 """
256 if text is not None:
257 return ' '.join(text.split())
258
259
260
262 """Replace tab, LF and CR characters with spaces, but don't collapse.
263
264 See "Replace Whitespace Policy" in the phyloXML spec glossary:
265 http://phyloxml.org/documentation/version_100/phyloxml.xsd.html#Glossary
266 """
267 for char in ('\t', '\n', '\r'):
268 if char in text:
269 text = text.replace(char, ' ')
270 return text
271
272
274 """Methods for parsing all phyloXML nodes from an XML stream.
275
276 To minimize memory use, the tree of ElementTree parsing events is cleared
277 after completing each phylogeny, clade, and top-level 'other' element.
278 Elements below the clade level are kept in memory until parsing of the
279 current clade is finished -- this shouldn't be a problem because clade is
280 the only recursive element, and non-clade nodes below this level are of
281 bounded size.
282 """
283
285
286 context = iter(ElementTree.iterparse(file, events=('start', 'end')))
287 event, root = context.next()
288 self.root = root
289 self.context = context
290
314
316 """Parse the phyloXML file incrementally and return each phylogeny."""
317 phytag = _ns('phylogeny')
318 for event, elem in self.context:
319 if event == 'start' and elem.tag == phytag:
320 yield self._parse_phylogeny(elem)
321
322
323
325 """Parse a single phylogeny within the phyloXML tree.
326
327 Recursively builds a phylogenetic tree with help from parse_clade, then
328 clears the XML event history for the phylogeny element and returns
329 control to the top-level parsing function.
330 """
331 phylogeny = PX.Phylogeny(**_dict_str2bool(parent.attrib,
332 ['rooted', 'rerootable']))
333 list_types = {
334
335 'confidence': 'confidences',
336 'property': 'properties',
337 'clade_relation': 'clade_relations',
338 'sequence_relation': 'sequence_relations',
339 }
340 for event, elem in self.context:
341 namespace, tag = _split_namespace(elem.tag)
342 if event == 'start' and tag == 'clade':
343 assert phylogeny.root is None, \
344 "Phylogeny object should only have 1 clade"
345 phylogeny.root = self._parse_clade(elem)
346 continue
347 if event == 'end':
348 if tag == 'phylogeny':
349 parent.clear()
350 break
351
352 if tag in list_types:
353 getattr(phylogeny, list_types[tag]).append(
354 getattr(self, tag)(elem))
355
356 elif tag in ('date', 'id'):
357 setattr(phylogeny, tag, getattr(self, tag)(elem))
358
359 elif tag in ('name', 'description'):
360 setattr(phylogeny, tag, _collapse_wspace(elem.text))
361
362 elif namespace != NAMESPACES['phy']:
363 phylogeny.other.append(self.other(elem, namespace, tag))
364 parent.clear()
365 else:
366
367 raise PhyloXMLError('Misidentified tag: ' + tag)
368 return phylogeny
369
370 _clade_complex_types = ['color', 'events', 'binary_characters', 'date']
371 _clade_list_types = {
372 'confidence': 'confidences',
373 'distribution': 'distributions',
374 'reference': 'references',
375 'property': 'properties',
376 }
377 _clade_tracked_tags = set(_clade_complex_types + _clade_list_types.keys()
378 + ['branch_length', 'name', 'node_id', 'width'])
379
381 """Parse a Clade node and its children, recursively."""
382 clade = PX.Clade(**parent.attrib)
383 if clade.branch_length is not None:
384 clade.branch_length = float(clade.branch_length)
385
386 tag_stack = []
387 for event, elem in self.context:
388 namespace, tag = _split_namespace(elem.tag)
389 if event == 'start':
390 if tag == 'clade':
391 clade.clades.append(self._parse_clade(elem))
392 continue
393 if tag == 'taxonomy':
394 clade.taxonomies.append(self._parse_taxonomy(elem))
395 continue
396 if tag == 'sequence':
397 clade.sequences.append(self._parse_sequence(elem))
398 continue
399 if tag in self._clade_tracked_tags:
400 tag_stack.append(tag)
401 if event == 'end':
402 if tag == 'clade':
403 elem.clear()
404 break
405 if tag != tag_stack[-1]:
406 continue
407 tag_stack.pop()
408
409 if tag in self._clade_list_types:
410 getattr(clade, self._clade_list_types[tag]).append(
411 getattr(self, tag)(elem))
412 elif tag in self._clade_complex_types:
413 setattr(clade, tag, getattr(self, tag)(elem))
414 elif tag == 'branch_length':
415
416 if clade.branch_length is not None:
417 raise PhyloXMLError(
418 'Attribute branch_length was already set '
419 'for this Clade.')
420 clade.branch_length = _float(elem.text)
421 elif tag == 'width':
422 clade.width = _float(elem.text)
423 elif tag == 'name':
424 clade.name = _collapse_wspace(elem.text)
425 elif tag == 'node_id':
426 clade.node_id = PX.Id(elem.text.strip(),
427 elem.attrib.get('provider'))
428 elif namespace != NAMESPACES['phy']:
429 clade.other.append(self.other(elem, namespace, tag))
430 elem.clear()
431 else:
432 raise PhyloXMLError('Misidentified tag: ' + tag)
433 return clade
434
436 sequence = PX.Sequence(**parent.attrib)
437 for event, elem in self.context:
438 namespace, tag = _split_namespace(elem.tag)
439 if event == 'end':
440 if tag == 'sequence':
441 parent.clear()
442 break
443 if tag in ('accession', 'mol_seq', 'uri',
444 'domain_architecture'):
445 setattr(sequence, tag, getattr(self, tag)(elem))
446 elif tag == 'annotation':
447 sequence.annotations.append(self.annotation(elem))
448 elif tag == 'name':
449 sequence.name = _collapse_wspace(elem.text)
450 elif tag in ('symbol', 'location'):
451 setattr(sequence, tag, elem.text)
452 elif namespace != NAMESPACES['phy']:
453 sequence.other.append(self.other(elem, namespace, tag))
454 parent.clear()
455 return sequence
456
458 taxonomy = PX.Taxonomy(**parent.attrib)
459 for event, elem in self.context:
460 namespace, tag = _split_namespace(elem.tag)
461 if event == 'end':
462 if tag == 'taxonomy':
463 parent.clear()
464 break
465 if tag in ('id', 'uri'):
466 setattr(taxonomy, tag, getattr(self, tag)(elem))
467 elif tag == 'common_name':
468 taxonomy.common_names.append(_collapse_wspace(elem.text))
469 elif tag == 'synonym':
470 taxonomy.synonyms.append(elem.text)
471 elif tag in ('code', 'scientific_name', 'authority', 'rank'):
472
473 setattr(taxonomy, tag, elem.text)
474 elif namespace != NAMESPACES['phy']:
475 taxonomy.other.append(self.other(elem, namespace, tag))
476 parent.clear()
477 return taxonomy
478
479 - def other(self, elem, namespace, localtag):
484
485
486
489
491 return PX.Annotation(
492 desc=_collapse_wspace(_get_child_text(elem, 'desc')),
493 confidence=_get_child_as(elem, 'confidence', self.confidence),
494 properties=_get_children_as(elem, 'property', self.property),
495 uri=_get_child_as(elem, 'uri', self.uri),
496 **elem.attrib)
497
501 return PX.BinaryCharacters(
502 type=elem.get('type'),
503 gained_count=_int(elem.get('gained_count')),
504 lost_count=_int(elem.get('lost_count')),
505 present_count=_int(elem.get('present_count')),
506 absent_count=_int(elem.get('absent_count')),
507
508 gained=_get_child_as(elem, 'gained', bc_getter),
509 lost=_get_child_as(elem, 'lost', bc_getter),
510 present=_get_child_as(elem, 'present', bc_getter),
511 absent=_get_child_as(elem, 'absent', bc_getter))
512
518
523
528
def date(self, elem):
    """Build a `PX.Date` from a date element.

    The ``unit`` attribute is taken as-is, the ``desc`` child text has its
    whitespace collapsed, and the ``value``, ``minimum`` and ``maximum``
    child texts are converted with ``float``. Any of these that is absent
    is passed on as None.
    """
    unit = elem.get('unit')
    description = _collapse_wspace(_get_child_text(elem, 'desc'))
    value = _get_child_text(elem, 'value', float)
    lower = _get_child_text(elem, 'minimum', float)
    upper = _get_child_text(elem, 'maximum', float)
    return PX.Date(unit=unit, desc=description, value=value,
                   minimum=lower, maximum=upper)
537
543
def domain(self, elem):
    """Build a `PX.ProteinDomain` from a domain element.

    The element text (stripped) is the first positional argument. The
    ``from`` attribute is decremented by one while ``to`` is used unchanged;
    `Writer.domain` adds the one back when serializing.
    NOTE(review): presumably this maps 1-based inclusive file coordinates
    onto Python-style slice bounds -- confirm against `PX.ProteinDomain`.
    """
    label = elem.text.strip()
    start = int(elem.get('from')) - 1
    end = int(elem.get('to'))
    score = _float(elem.get('confidence'))
    return PX.ProteinDomain(label, start, end,
                            confidence=score,
                            id=elem.get('id'))
550
def domain_architecture(self, elem):
    """Build a `PX.DomainArchitecture` from a domain_architecture element.

    The ``length`` attribute is converted to ``int`` when present and passed
    as None when absent; a malformed value still raises ``ValueError``.
    Each ``domain`` child is converted with `self.domain`.
    """
    # The Writer leaves `length` out when it is None (see `_clean_attrib`),
    # so the attribute can be missing; calling int() on None would raise
    # TypeError for such a file.
    length = elem.get('length')
    if length is not None:
        length = int(length)
    return PX.DomainArchitecture(
        length=length,
        domains=_get_children_as(elem, 'domain', self.domain))
555
557 return PX.Events(
558 type=_get_child_text(elem, 'type'),
559 duplications=_get_child_text(elem, 'duplications', int),
560 speciations=_get_child_text(elem, 'speciations', int),
561 losses=_get_child_text(elem, 'losses', int),
562 confidence=_get_child_as(elem, 'confidence', self.confidence))
563
def id(self, elem):
    """Build a `PX.Id` from an id element.

    The provider is read from the ``provider`` attribute, falling back to
    the ``type`` attribute when ``provider`` is missing or empty. The
    element text is stripped of surrounding whitespace.
    """
    provider = elem.get('provider')
    if not provider:
        provider = elem.get('type')
    return PX.Id(elem.text.strip(), provider)
567
569 is_aligned = elem.get('is_aligned')
570 if is_aligned is not None:
571 is_aligned = _str2bool(is_aligned)
572 return PX.MolSeq(elem.text.strip(), is_aligned=is_aligned)
573
575 return PX.Point(
576 elem.get('geodetic_datum'),
577 _get_child_text(elem, 'lat', float),
578 _get_child_text(elem, 'long', float),
579 alt=_get_child_text(elem, 'alt', float),
580 alt_unit=elem.get('alt_unit'))
581
585
587 return PX.Property(elem.text.strip(),
588 elem.get('ref'), elem.get('applies_to'), elem.get('datatype'),
589 unit=elem.get('unit'),
590 id_ref=elem.get('id_ref'))
591
596
602
603 - def uri(self, elem):
607
608
609
610
611
612
614 """Convert a Python primitive to a phyloXML-compatible Unicode string."""
615 if isinstance(value, float):
616 return unicode(value).upper()
617 elif isinstance(value, bool):
618 return unicode(value).lower()
619 return unicode(value)
620
621
623 """Create a dictionary from an object's specified, non-None attributes."""
624 out = {}
625 for key in attrs:
626 val = getattr(obj, key)
627 if val is not None:
628 out[key] = _serialize(val)
629 return out
630
631
633 def wrapped(self, obj):
634 elem = ElementTree.Element(tag, _clean_attrib(obj, attribs))
635 for subn in subnodes:
636 if isinstance(subn, basestring):
637
638 if getattr(obj, subn) is not None:
639 elem.append(getattr(self, subn)(getattr(obj, subn)))
640 else:
641
642 method, plural = subn
643 for item in getattr(obj, plural):
644 elem.append(getattr(self, method)(item))
645 if has_text:
646 elem.text = _serialize(obj.value)
647 return elem
648 wrapped.__doc__ = "Serialize a %s and its subnodes, in order." % tag
649 return wrapped
650
651
653 def wrapped(self, obj):
654 elem = ElementTree.Element(tag)
655 elem.text = _serialize(obj)
656 return elem
657 wrapped.__doc__ = "Serialize a simple %s node." % tag
658 return wrapped
659
660
662 """Methods for serializing a PhyloXML object to XML."""
663
665 """Build an ElementTree from a PhyloXML object."""
666 assert isinstance(phyloxml, PX.Phyloxml), "Not a Phyloxml object"
667 self._tree = ElementTree.ElementTree(self.phyloxml(phyloxml))
668
669 - def write(self, file, encoding='utf-8', indent=True):
674
675
676
678 elem = ElementTree.Element('phyloxml', obj.attributes)
679 for tree in obj.phylogenies:
680 elem.append(self.phylogeny(tree))
681 for otr in obj.other:
682 elem.append(self.other(otr))
683 return elem
684
686 elem = ElementTree.Element(_ns(obj.tag, obj.namespace), obj.attributes)
687 elem.text = obj.value
688 for child in obj.children:
689 elem.append(self.other(child))
690 return elem
691
692 phylogeny = _handle_complex('phylogeny',
693 ('rooted', 'rerootable', 'branch_length_unit', 'type'),
694 ( 'name',
695 'id',
696 'description',
697 'date',
698 ('confidence', 'confidences'),
699 'clade',
700 ('clade_relation', 'clade_relations'),
701 ('sequence_relation', 'sequence_relations'),
702 ('property', 'properties'),
703 ('other', 'other'),
704 ))
705
706 clade = _handle_complex('clade', ('id_source',),
707 ( 'name',
708 'branch_length',
709 ('confidence', 'confidences'),
710 'width',
711 'color',
712 'node_id',
713 ('taxonomy', 'taxonomies'),
714 ('sequence', 'sequences'),
715 'events',
716 'binary_characters',
717 ('distribution', 'distributions'),
718 'date',
719 ('reference', 'references'),
720 ('property', 'properties'),
721 ('clade', 'clades'),
722 ('other', 'other'),
723 ))
724
725 accession = _handle_complex('accession', ('source',),
726 (), has_text=True)
727
728 annotation = _handle_complex('annotation',
729 ('ref', 'source', 'evidence', 'type'),
730 ( 'desc',
731 'confidence',
732 ('property', 'properties'),
733 'uri',
734 ))
735
737 """Serialize a binary_characters node and its subnodes."""
738 elem = ElementTree.Element('binary_characters',
739 _clean_attrib(obj,
740 ('type', 'gained_count', 'lost_count',
741 'present_count', 'absent_count')))
742 for subn in ('gained', 'lost', 'present', 'absent'):
743 subelem = ElementTree.Element(subn)
744 for token in getattr(obj, subn):
745 subelem.append(self.bc(token))
746 elem.append(subelem)
747 return elem
748
749 clade_relation = _handle_complex('clade_relation',
750 ('id_ref_0', 'id_ref_1', 'distance', 'type'),
751 ('confidence',))
752
753 color = _handle_complex('color', (), ('red', 'green', 'blue'))
754
755 confidence = _handle_complex('confidence', ('type',),
756 (), has_text=True)
757
758 date = _handle_complex('date', ('unit',),
759 ('desc', 'value', 'minimum', 'maximum'))
760
761 distribution = _handle_complex('distribution', (),
762 ( 'desc',
763 ('point', 'points'),
764 ('polygon', 'polygons'),
765 ))
766
def domain(self, obj):
    """Serialize a domain node.

    ``from`` is written as ``obj.start + 1`` and ``to`` as ``obj.end``;
    ``confidence`` and ``id`` are only emitted when they are not None. The
    element text is the serialized ``obj.value``.
    """
    attrib = {'from': str(obj.start + 1), 'to': str(obj.end)}
    if obj.confidence is not None:
        attrib['confidence'] = _serialize(obj.confidence)
    if obj.id is not None:
        attrib['id'] = obj.id
    node = ElementTree.Element('domain', attrib)
    node.text = _serialize(obj.value)
    return node
777
778 domain_architecture = _handle_complex('domain_architecture',
779 ('length',),
780 (('domain', 'domains'),))
781
782 events = _handle_complex('events', (),
783 ( 'type',
784 'duplications',
785 'speciations',
786 'losses',
787 'confidence',
788 ))
789
790 id = _handle_complex('id', ('provider',), (), has_text=True)
791
792 mol_seq = _handle_complex('mol_seq', ('is_aligned',),
793 (), has_text=True)
794
795 node_id = _handle_complex('node_id', ('provider',), (), has_text=True)
796
797 point = _handle_complex('point', ('geodetic_datum', 'alt_unit'),
798 ('lat', 'long', 'alt'))
799
800 polygon = _handle_complex('polygon', (), (('point', 'points'),))
801
802 property = _handle_complex('property',
803 ('ref', 'unit', 'datatype', 'applies_to', 'id_ref'),
804 (), has_text=True)
805
806 reference = _handle_complex('reference', ('doi',), ('desc',))
807
808 sequence = _handle_complex('sequence',
809 ('type', 'id_ref', 'id_source'),
810 ( 'symbol',
811 'accession',
812 'name',
813 'location',
814 'mol_seq',
815 'uri',
816 ('annotation', 'annotations'),
817 'domain_architecture',
818 ('other', 'other'),
819 ))
820
821 sequence_relation = _handle_complex('sequence_relation',
822 ('id_ref_0', 'id_ref_1', 'distance', 'type'),
823 ('confidence',))
824
825 taxonomy = _handle_complex('taxonomy',
826 ('id_source',),
827 ( 'id',
828 'code',
829 'scientific_name',
830 'authority',
831 ('common_name', 'common_names'),
832 ('synonym', 'synonyms'),
833 'rank',
834 'uri',
835 ('other', 'other'),
836 ))
837
838 uri = _handle_complex('uri', ('desc', 'type'), (), has_text=True)
839
840
841
842
843 alt = _handle_simple('alt')
844 branch_length = _handle_simple('branch_length')
845 lat = _handle_simple('lat')
846 long = _handle_simple('long')
847 maximum = _handle_simple('maximum')
848 minimum = _handle_simple('minimum')
849 value = _handle_simple('value')
850 width = _handle_simple('width')
851
852
853 blue = _handle_simple('blue')
854 duplications = _handle_simple('duplications')
855 green = _handle_simple('green')
856 losses = _handle_simple('losses')
857 red = _handle_simple('red')
858 speciations = _handle_simple('speciations')
859
860
861 bc = _handle_simple('bc')
862 code = _handle_simple('code')
863 common_name = _handle_simple('common_name')
864 desc = _handle_simple('desc')
865 description = _handle_simple('description')
866 location = _handle_simple('location')
867 name = _handle_simple('name')
868 rank = _handle_simple('rank')
869 scientific_name = _handle_simple('scientific_name')
870 symbol = _handle_simple('symbol')
871 synonym = _handle_simple('synonym')
872 type = _handle_simple('type')
873