Package Bio :: Package Phylo :: Module PhyloXML
[hide private]
[frames | no frames]

Source Code for Module Bio.Phylo.PhyloXML

   1  # Copyright (C) 2009 by Eric Talevich (eric.talevich@gmail.com) 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Classes corresponding to phyloXML elements. 
   7   
   8  See Also 
   9  -------- 
  10  Official specification: 
  11     http://phyloxml.org/ 
  12  Journal article: 
  13      Han and Zmasek (2009), doi:10.1186/1471-2105-10-356 
  14   
  15  """ 
  16   
  17  import re 
  18  import sys 
  19  import warnings 
  20   
  21  from Bio._py3k import basestring 
  22   
  23  from Bio import Alphabet 
  24  from Bio.Align import MultipleSeqAlignment 
  25  from Bio.Seq import Seq 
  26  from Bio.SeqFeature import SeqFeature, FeatureLocation 
  27  from Bio.SeqRecord import SeqRecord 
  28  from Bio import BiopythonWarning 
  29   
  30  from Bio.Phylo import BaseTree 
class PhyloXMLWarning(BiopythonWarning):
    """Warning category for data that violates the phyloXML specification."""
37
38 39 -def _check_str(text, testfunc):
40 """Check a string using testfunc, and warn if there's no match.""" 41 if text is not None and not testfunc(text): 42 warnings.warn("String %s doesn't match the given regexp" % text, 43 PhyloXMLWarning, stacklevel=2)
44
# Core elements

class PhyloElement(BaseTree.TreeElement):
    """Common base class shared by every PhyloXML object."""
50
class Phyloxml(PhyloElement):
    """Root node of the PhyloXML document.

    Holds any number of Phylogeny elements, optionally followed by elements
    from other namespaces.

    :Parameters:
        attributes : dict
            (XML namespace definitions)
        phylogenies : list
            The phylogenetic trees
        other : list
            Arbitrary non-phyloXML elements, if any

    """

    def __init__(self, attributes, phylogenies=None, other=None):
        """Store namespace attributes, phylogenies and foreign elements."""
        # Begin with the standard namespace declarations; anything supplied
        # by the caller is layered on top and wins on key collisions.
        merged = {
            "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
            "xmlns": "http://www.phyloxml.org",
            "xsi:schemaLocation": "http://www.phyloxml.org http://www.phyloxml.org/1.10/phyloxml.xsd",
        }
        if attributes:
            merged.update(attributes)
        self.attributes = merged
        self.phylogenies = phylogenies if phylogenies else []
        self.other = other if other else []

    def __getitem__(self, index):
        """Look up a phylogeny by integer index, slice, or tree name."""
        if isinstance(index, (int, slice)):
            return self.phylogenies[index]
        if not isinstance(index, basestring):
            raise KeyError("can't use %s as an index" % type(index))
        # String key: return the first tree carrying that name.
        for candidate in self.phylogenies:
            if candidate.name == index:
                return candidate
        raise KeyError("no phylogeny found with name " + repr(index))

    def __iter__(self):
        """Iterate over the contained phylogenetic trees."""
        return iter(self.phylogenies)

    def __len__(self):
        """Return the number of contained phylogenetic trees."""
        return len(self.phylogenies)

    def __str__(self):
        """Return the class name wrapping the string form of each tree."""
        joined = ',\n'.join(str(tree) for tree in self.phylogenies)
        return '%s([%s])' % (self.__class__.__name__, joined)
105
class Other(PhyloElement):
    """Container for non-phyloXML elements in the tree.

    An Other object normally carries either a 'value' or a non-empty list of
    'children', not both -- but nothing here enforces that.

    :Parameters:
        tag : string
            local tag for the XML node
        namespace : string
            XML namespace for the node -- should not be the default phyloXML
            namespace.
        attributes : dict of strings
            attributes on the XML node
        value : string
            text contained directly within this XML node
        children : list
            child nodes, if any (also `Other` instances)

    """

    def __init__(self, tag, namespace=None, attributes=None, value=None,
                 children=None):
        """Store the tag, namespace, attributes, text and child nodes."""
        self.tag = tag
        self.namespace = namespace
        self.value = value
        # Falsy containers are replaced by fresh empty ones.
        self.attributes = attributes if attributes else {}
        self.children = children if children else []

    def __iter__(self):
        """Iterate over the child nodes (possibly none)."""
        return iter(self.children)
140
class Phylogeny(PhyloElement, BaseTree.Tree):
    """A phylogenetic tree.

    :Parameters:
        root : Clade
            the root node/clade of this tree
        rooted : bool
            True if this tree is rooted
        rerootable : bool
            True if this tree is rerootable
        branch_length_unit : string
            unit for branch_length values on clades
        name : string
            identifier for this tree, not required to be unique
        id : Id
            unique identifier for this tree
        description : string
            plain-text description
        date : Date
            date for the root node of this tree
        confidences : list
            Confidence objects for this tree
        clade_relations : list
            CladeRelation objects
        sequence_relations : list
            SequenceRelation objects
        properties : list
            Property objects
        other : list
            non-phyloXML elements (type `Other`)

    """

    def __init__(self, root=None, rooted=True,
                 rerootable=None, branch_length_unit=None, type=None,
                 # Child nodes
                 name=None, id=None, description=None, date=None,
                 # Collections
                 confidences=None, clade_relations=None,
                 sequence_relations=None, properties=None, other=None,
                 ):
        """Store the tree attributes, child nodes and collections."""
        assert isinstance(rooted, bool)
        # Attributes
        self.root = root
        self.rooted = rooted
        self.rerootable = rerootable
        self.branch_length_unit = branch_length_unit
        self.type = type
        # Child nodes
        self.name = name
        self.id = id
        self.description = description
        self.date = date
        # Collections: falsy arguments become fresh empty lists
        self.confidences = confidences if confidences else []
        self.clade_relations = clade_relations if clade_relations else []
        self.sequence_relations = (sequence_relations
                                   if sequence_relations else [])
        self.properties = properties if properties else []
        self.other = other if other else []

    @classmethod
    def from_tree(cls, tree, **kwargs):
        """Create a new Phylogeny given a Tree (from Newick/Nexus or BaseTree).

        Keyword arguments are the usual `Phylogeny` constructor parameters;
        they are written directly into the new object's attributes.
        """
        root = Clade.from_clade(tree.root)
        # Wrap a present tree id in an Id element; keep None as None.
        tree_id = None if tree.id is None else Id(str(tree.id))
        phy = cls(root=root, rooted=tree.rooted, name=tree.name, id=tree_id)
        phy.__dict__.update(kwargs)
        return phy

    @classmethod
    def from_clade(cls, clade, **kwargs):
        """Create a new Phylogeny given a Newick or BaseTree Clade object.

        Keyword arguments are the usual `PhyloXML.Clade` constructor
        parameters.
        """
        converted = Clade.from_clade(clade)
        return converted.to_phylogeny(**kwargs)

    def as_phyloxml(self):
        """Return this tree, a PhyloXML-compatible Phylogeny object.

        Overrides the `BaseTree` method.
        """
        return self

    def to_phyloxml_container(self, **kwargs):
        """Create a new Phyloxml object containing just this phylogeny."""
        return Phyloxml(kwargs, phylogenies=[self])

    def to_alignment(self):
        """Construct an alignment from the aligned sequences in this tree."""
        def is_aligned_seq(elem):
            return bool(isinstance(elem, Sequence) and
                        elem.mol_seq.is_aligned)

        aligned = self._filter_search(is_aligned_seq, 'preorder', True)
        head = next(aligned, None)
        if head is None:
            # No aligned sequences were found --> empty MSA
            return MultipleSeqAlignment([])
        # The first sequence fixes the alphabet; the rest are appended.
        msa = MultipleSeqAlignment([head.to_seqrecord()],
                                   head.get_alphabet())
        msa.extend(seq.to_seqrecord() for seq in aligned)
        return msa

    # Singular property for plural attribute
    def _get_confidence(self):
        """Equivalent to self.confidences[0] if there is only 1 value.

        See Also: `Clade.confidence`, `Clade.taxonomy`

        """
        count = len(self.confidences)
        if count == 0:
            return None
        if count > 1:
            raise AttributeError("more than 1 confidence value available; "
                                 "use Phylogeny.confidences")
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the single confidence value, replacing an existing one."""
        if value is None:
            # Special case: mirror the behavior of _get_confidence
            self.confidences = []
            return
        if isinstance(value, (float, int)):
            value = Confidence(value)
        elif not isinstance(value, Confidence):
            raise ValueError("value must be a number or Confidence instance")
        count = len(self.confidences)
        if count > 1:
            raise ValueError("multiple confidence values already exist; "
                             "use Phylogeny.confidences instead")
        if count == 1:
            self.confidences[0] = value
        else:
            self.confidences.append(value)

    def _del_confidence(self):
        """Discard all confidence values."""
        self.confidences = []

    confidence = property(_get_confidence, _set_confidence, _del_confidence)
class Clade(PhyloElement, BaseTree.Clade):
    """Describes a branch of the current phylogenetic tree.

    Used recursively, describes the topology of a phylogenetic tree.

    Both ``color`` and ``width`` elements should be interpreted by client code
    as applying to the whole clade, including all descendants, unless
    overwritten in sub-clades. This module doesn't automatically assign these
    attributes to sub-clades to achieve this cascade -- and neither should you.

    :Parameters:
        branch_length
            parent branch length of this clade
        id_source
            link other elements to a clade (on the xml-level)
        name : string
            short label for this clade
        confidences : list of Confidence objects
            used to indicate the support for a clade/parent branch.
        width : float
            branch width for this clade (including branch from parent)
        color : BranchColor
            color used for graphical display of this clade
        node_id
            unique identifier for the root node of this clade
        taxonomies : list
            Taxonomy objects
        sequences : list
            Sequence objects
        events : Events
            describe such events as gene-duplications at the root node/parent
            branch of this clade
        binary_characters : BinaryCharacters
            binary characters
        distributions : list of Distribution objects
            distribution(s) of this clade
        date : Date
            a date for the root node of this clade
        references : list
            Reference objects
        properties : list
            Property objects
        clades : list of Clade objects
            Sub-clades
        other : list of Other objects
            non-phyloXML objects

    """

    def __init__(self,
                 # Attributes
                 branch_length=None, id_source=None,
                 # Child nodes
                 name=None, width=None, color=None, node_id=None, events=None,
                 binary_characters=None, date=None,
                 # Collections
                 confidences=None, taxonomies=None, sequences=None,
                 distributions=None, references=None, properties=None,
                 clades=None, other=None,
                 ):
        """Initialize value for the Clade object.

        Collection arguments that are None (or otherwise falsy) are replaced
        by fresh empty lists.
        """
        self.branch_length = branch_length
        self.id_source = id_source
        self.name = name
        self.width = width
        self.color = color
        self.node_id = node_id
        self.events = events
        self.binary_characters = binary_characters
        self.date = date
        self.confidences = confidences or []
        self.taxonomies = taxonomies or []
        self.sequences = sequences or []
        self.distributions = distributions or []
        self.references = references or []
        self.properties = properties or []
        self.clades = clades or []
        self.other = other or []

    @classmethod
    def from_clade(cls, clade, **kwargs):
        """Create a new PhyloXML Clade from a Newick or BaseTree Clade object.

        Branch length, name, confidence, width and color are copied and the
        sub-clades are converted recursively. Keyword arguments are the usual
        PhyloXML Clade constructor parameters; they are written directly into
        the attributes of the top-level clade only.
        """
        new_clade = cls(branch_length=clade.branch_length,
                        name=clade.name)
        new_clade.clades = [cls.from_clade(c) for c in clade]
        # Goes through the `confidence` property setter defined below.
        new_clade.confidence = clade.confidence
        new_clade.width = clade.width
        new_clade.color = (BranchColor(
            clade.color.red, clade.color.green, clade.color.blue)
            if clade.color else None)
        new_clade.__dict__.update(kwargs)
        return new_clade

    def to_phylogeny(self, **kwargs):
        """Create a new phylogeny containing just this clade.

        The clade becomes the root and its date is reused as the tree date;
        keyword arguments are written directly into the Phylogeny attributes.
        """
        phy = Phylogeny(root=self, date=self.date)
        phy.__dict__.update(kwargs)
        return phy

    # Shortcuts for list attributes that are usually only 1 item
    # NB: Duplicated from Phylogeny class
    def _get_confidence(self):
        """Return the only confidence value, or None if there is none.

        Raises AttributeError if more than one confidence is stored.
        """
        if len(self.confidences) == 0:
            return None
        if len(self.confidences) > 1:
            raise AttributeError("more than 1 confidence value available; "
                                 "use Clade.confidences")
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the single confidence value.

        None clears the list; plain numbers are wrapped in Confidence.
        Raises ValueError for any other type, or when several confidence
        values already exist.
        """
        if value is None:
            # Special case: mirror the behavior of _get_confidence
            self.confidences = []
            return
        if isinstance(value, (float, int)):
            value = Confidence(value)
        elif not isinstance(value, Confidence):
            raise ValueError("value must be a number or Confidence instance")
        if len(self.confidences) == 0:
            self.confidences.append(value)
        elif len(self.confidences) == 1:
            self.confidences[0] = value
        else:
            # Point the user to this class's list, not Phylogeny's.
            raise ValueError("multiple confidence values already exist; "
                             "use Clade.confidences instead")

    def _del_confidence(self):
        """Discard all confidence values."""
        self.confidences = []

    confidence = property(_get_confidence, _set_confidence, _del_confidence)

    def _get_taxonomy(self):
        """Return the only taxonomy, or None if there is none.

        Raises AttributeError if more than one taxonomy is stored.
        """
        if len(self.taxonomies) == 0:
            return None
        if len(self.taxonomies) > 1:
            raise AttributeError("more than 1 taxonomy value available; "
                                 "use Clade.taxonomies")
        return self.taxonomies[0]

    def _set_taxonomy(self, value):
        """Set the single taxonomy value.

        Raises ValueError if value is not a Taxonomy, or when several
        taxonomies already exist.
        """
        if not isinstance(value, Taxonomy):
            raise ValueError("assigned value must be a Taxonomy instance")
        if len(self.taxonomies) == 0:
            self.taxonomies.append(value)
        elif len(self.taxonomies) == 1:
            self.taxonomies[0] = value
        else:
            # The taxonomies list lives on Clade; Phylogeny has none.
            raise ValueError("multiple taxonomy values already exist; "
                             "use Clade.taxonomies instead")

    taxonomy = property(_get_taxonomy, _set_taxonomy)
441
# PhyloXML wrapper for a special BaseTree attribute

class BranchColor(PhyloElement, BaseTree.BranchColor):
    """PhyloXML element wrapping ``BaseTree.BranchColor``."""

    def __init__(self, *args, **kwargs):
        """Forward every argument to ``BaseTree.BranchColor.__init__``."""
        BaseTree.BranchColor.__init__(self, *args, **kwargs)
450
# PhyloXML-specific complex types

class Accession(PhyloElement):
    """Captures the local part in a sequence identifier.

    Example: for ``UniProtKB:P17304`` the ``value`` attribute is 'P17304'
    and the ``source`` attribute is 'UniProtKB'.
    """

    def __init__(self, value, source):
        """Store the accession value and the source it comes from."""
        self.source = source
        self.value = value

    def __str__(self):
        """Return the accession formatted as ``source:value``."""
        parts = (self.source, self.value)
        return '%s:%s' % parts
469
class Annotation(PhyloElement):
    """The annotation of a molecular sequence.

    It is recommended to annotate by using the optional 'ref' attribute.

    :Parameters:
        ref : string
            reference string, e.g. 'GO:0008270',
            'KEGG:Tetrachloroethene degradation', 'EC:1.1.1.1'
        source : string
            plain-text source for this annotation
        evidence : str
            describe evidence as free text (e.g. 'experimental')
        desc : string
            free text description
        confidence : Confidence
            state the type and value of support (type Confidence)
        properties : list
            typed and referenced annotations from external resources
        uri : Uri
            link

    """

    re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')

    def __init__(self,
                 # Attributes
                 ref=None, source=None, evidence=None, type=None,
                 # Child nodes
                 desc=None, confidence=None, uri=None,
                 # Collection
                 properties=None):
        """Check ``ref`` against ``re_ref`` (warning only) and store values."""
        _check_str(ref, self.re_ref.match)
        # Attributes
        self.ref = ref
        self.source = source
        self.evidence = evidence
        self.type = type
        # Child nodes
        self.desc = desc
        self.confidence = confidence
        self.uri = uri
        # Collection: a falsy argument becomes a fresh empty list
        self.properties = properties if properties else []
514
class BinaryCharacters(PhyloElement):
    """Binary characters at the root of a clade.

    Holds the names and/or counts of binary characters that are present,
    gained, and lost at the root of a clade.
    """

    def __init__(self,
                 # Attributes
                 type=None, gained_count=None, lost_count=None,
                 present_count=None, absent_count=None,
                 # Child nodes (flattened into collections)
                 gained=None, lost=None, present=None, absent=None):
        """Store the type, the four counts and the four name collections."""
        self.type = type
        # Counts
        self.gained_count = gained_count
        self.lost_count = lost_count
        self.present_count = present_count
        self.absent_count = absent_count
        # Name collections: falsy arguments become fresh empty lists
        self.gained = gained if gained else []
        self.lost = lost if lost else []
        self.present = present if present else []
        self.absent = absent if absent else []
539
class CladeRelation(PhyloElement):
    """Expresses a typed relationship between two clades.

    For example, this could be used to describe multiple parents of a clade.

    :Parameters:
        type : str
        id_ref_0 : str
        id_ref_1 : str
        distance : str
        confidence : Confidence

    """

    def __init__(self, type, id_ref_0, id_ref_1,
                 distance=None, confidence=None):
        """Store the relation type, both clade references and extras."""
        self.type = type
        self.id_ref_0 = id_ref_0
        self.id_ref_1 = id_ref_1
        self.distance = distance
        self.confidence = confidence
562
class Confidence(PhyloElement):
    """A general purpose confidence element.

    For example, this can be used to express the bootstrap support value of a
    clade (in which case the `type` attribute is 'bootstrap').

    Comparison, arithmetic and numeric-coercion operators all act on the
    ``value`` attribute and return plain numbers/booleans, not Confidence
    objects.

    :Parameters:
        value : float
            confidence value
        type : string
            label for the type of confidence, e.g. 'bootstrap'

    """

    def __init__(self, value, type='unknown'):
        """Initialize values for the Confidence object."""
        self.value = value
        self.type = type

    # Comparison operators

    def __hash__(self):
        """Return an identity-based hash.

        NOTE: equality below compares ``value``, so two Confidence objects
        that compare equal generally hash differently. The identity hash is
        kept because ``value`` is mutable.
        """
        return id(self)

    def __eq__(self, other):
        """Compare ``value`` with another Confidence's value or a number."""
        if isinstance(other, Confidence):
            return self.value == other.value
        return self.value == other

    def __ne__(self, other):
        """Inverse of ``__eq__``: compare values for inequality."""
        if isinstance(other, Confidence):
            return self.value != other.value
        return self.value != other

    # Ordering -- see functools.total_ordering in Py2.7

    def __lt__(self, other):
        """Return True if this value is smaller than the other value."""
        if isinstance(other, Confidence):
            return self.value < other.value
        return self.value < other

    def __le__(self, other):
        """Return True if this value is smaller than or equal to other."""
        return self < other or self == other

    def __gt__(self, other):
        """Return True if this value is greater than the other value."""
        return not (self <= other)

    def __ge__(self, other):
        """Return True if this value is greater than or equal to other."""
        # Delegate to __lt__ like the sibling operators do, so Confidence
        # operands are unwrapped explicitly.
        return not (self < other)

    # Arithmetic operators, including reverse

    def __add__(self, other):
        """Return ``self.value + other``."""
        return self.value + other

    def __radd__(self, other):
        """Return ``other + self.value``."""
        return other + self.value

    def __sub__(self, other):
        """Return ``self.value - other``."""
        return self.value - other

    def __rsub__(self, other):
        """Return ``other - self.value``."""
        return other - self.value

    def __mul__(self, other):
        """Return ``self.value * other``."""
        return self.value * other

    def __rmul__(self, other):
        """Return ``other * self.value``."""
        return other * self.value

    def __div__(self, other):
        """Return ``self.value / other`` (Python 2 classic-division hook).

        The ``/`` operator is used rather than ``self.value.__div__``: the
        direct dunder call returns NotImplemented for mixed int/float
        operands instead of coercing them.
        """
        return self.value / other

    def __rdiv__(self, other):
        """Return ``other / self.value`` (Python 2 classic-division hook)."""
        return other / self.value

    def __truediv__(self, other):
        """Rational-style division in Py3.0+.

        Also active in Py2.5+ with __future__.division import.
        """
        return self.value / other

    def __rtruediv__(self, other):
        """Return ``other / self.value`` for true division."""
        return other / self.value

    def __floordiv__(self, other):
        """C-style and old-style division in Py3.0+.

        Also active in Py2.5+ with __future__.division import.
        """
        # Use the operator so mixed int/float operands are coerced;
        # (5).__floordiv__(2.0) would return NotImplemented.
        return self.value // other

    def __rfloordiv__(self, other):
        """Return ``other // self.value``."""
        return other // self.value

    def __mod__(self, other):
        """Return ``self.value % other``."""
        return self.value % other

    def __rmod__(self, other):
        """Return ``other % self.value``."""
        return other % self.value

    def __divmod__(self, other):
        """Return ``divmod(self.value, other)``."""
        return divmod(self.value, other)

    def __rdivmod__(self, other):
        """Return ``divmod(other, self.value)``."""
        return divmod(other, self.value)

    def __pow__(self, other, modulo=None):
        """Return ``pow(self.value, other)``, with optional modulo."""
        if modulo is not None:
            return pow(self.value, other, modulo)
        return pow(self.value, other)

    def __rpow__(self, other):
        """Return ``pow(other, self.value)``."""
        return pow(other, self.value)

    # Unary arithmetic operations: -, +, abs()

    def __neg__(self):
        """Return the negated value."""
        return -self.value

    def __pos__(self):
        """Return the value unchanged."""
        return self.value

    def __abs__(self):
        """Return the absolute value."""
        return abs(self.value)

    # Explicit coercion to numeric types: float, int
    # (and under Python 2 only long)

    def __float__(self):
        """Return the value converted to float."""
        return float(self.value)

    def __int__(self):
        """Return the value converted to int."""
        return int(self.value)

    if sys.version_info[0] < 3:
        def __long__(self):
            """Return the value converted to long (Python 2 only)."""
            return long(self.value)  # noqa : F821
722
class Date(PhyloElement):
    """A date associated with a clade/node.

    The date can be numerical (the 'value' element) and/or free text (the
    'desc' element, e.g. 'Silurian'). When a numerical value is used, it is
    recommended to also give the 'unit' attribute.

    :Parameters:
        unit : string
            type of numerical value (e.g. 'mya' for 'million years ago')
        value : float
            the date value
        desc : string
            plain-text description of the date
        minimum : float
            lower bound on the date value
        maximum : float
            upper bound on the date value

    """

    def __init__(self, value=None, unit=None, desc=None,
                 minimum=None, maximum=None):
        """Store the value, unit, description and bounds."""
        self.unit = unit
        self.value = value
        self.desc = desc
        self.minimum = minimum
        self.maximum = maximum

    def __str__(self):
        """Return 'value unit', else the description, else the class name."""
        if self.unit and self.value is not None:
            numeric = (self.value, self.unit)
            return '%s %s' % numeric
        if self.desc is None:
            return self.__class__.__name__
        return self.desc
761
class Distribution(PhyloElement):
    """Geographic distribution of the items of a clade (species, sequences).

    Intended for phylogeographic applications.

    :Parameters:
        desc : string
            free-text description of the location
        points : list of `Point` objects
            coordinates (similar to the 'Point' element in Google's KML format)
        polygons : list of `Polygon` objects
            coordinate sets defining geographic regions

    """

    def __init__(self, desc=None, points=None, polygons=None):
        """Store the description, points and polygons."""
        self.desc = desc
        # Falsy collections are replaced by fresh empty lists.
        self.points = points if points else []
        self.polygons = polygons if polygons else []
783
class DomainArchitecture(PhyloElement):
    """Domain architecture of a protein.

    :Parameters:
        length : int
            total length of the protein sequence
        domains : list of ProteinDomain objects
            the domains within this protein

    """

    def __init__(self, length=None, domains=None):
        """Initialize values of the DomainArchitecture object.

        A missing ``domains`` argument becomes an empty list, like every
        other collection attribute in this module, so the attribute can
        always be iterated.
        """
        self.length = length
        self.domains = domains or []
800
class Events(PhyloElement):
    """Events at the root node of a clade (e.g. one gene duplication).

    All attributes are set to None by default, but this object can also be
    treated as a dictionary, in which case None values are treated as missing
    keys and deleting a key resets that attribute's value back to None.

    The dictionary view covers instance attributes only; methods and class
    attributes such as ``ok_type`` are never reported as keys.
    """

    ok_type = set(('transfer', 'fusion', 'speciation_or_duplication', 'other',
                   'mixed', 'unassigned'))

    def __init__(self, type=None, duplications=None, speciations=None,
                 losses=None, confidence=None):
        """Initialize values of the Events object.

        ``type`` is checked against ``ok_type``; an unknown value only
        produces a warning.
        """
        _check_str(type, self.ok_type.__contains__)
        self.type = type
        self.duplications = duplications
        self.speciations = speciations
        self.losses = losses
        self.confidence = confidence

    def items(self):
        """Return (name, value) pairs for the attributes that are set."""
        return [(k, v) for k, v in self.__dict__.items() if v is not None]

    def keys(self):
        """Return the names of the attributes that are set."""
        return [k for k, v in self.__dict__.items() if v is not None]

    def values(self):
        """Return the values of the attributes that are set."""
        return [v for v in self.__dict__.values() if v is not None]

    def __len__(self):
        """Return the number of attributes that are set."""
        # TODO - Better way to do this?
        return len(self.values())

    def __getitem__(self, key):
        """Return the value stored under ``key``.

        Raises KeyError if ``key`` is not an instance attribute or its value
        is None.
        """
        # Consult the instance dict (as keys()/items() do) rather than
        # hasattr(), which would also match methods and class attributes.
        if key not in self.__dict__:
            raise KeyError(key)
        val = self.__dict__[key]
        if val is None:
            raise KeyError("%s has not been set in this object" % repr(key))
        return val

    def __setitem__(self, key, val):
        """Set the attribute named ``key`` to ``val``."""
        setattr(self, key, val)

    def __delitem__(self, key):
        """Reset the attribute named ``key`` back to None."""
        setattr(self, key, None)

    def __iter__(self):
        """Iterate over the names of the attributes that are set."""
        return iter(self.keys())

    def __contains__(self, key):
        """Return True if ``key`` is an instance attribute that is set."""
        return self.__dict__.get(key) is not None
857
class Id(PhyloElement):
    """A general-purpose identifier element.

    Allows to indicate the provider (or authority) of an identifier, e.g. NCBI,
    along with the value itself.
    """

    def __init__(self, value, provider=None):
        """Initialize values for the identifier object."""
        self.value = value
        self.provider = provider

    def __str__(self):
        """Return ``provider:value``, or just the value without a provider."""
        if self.provider is not None:
            return '%s:%s' % (self.provider, self.value)
        if isinstance(self.value, basestring):
            return self.value
        # __str__ must return a string; a non-string value (e.g. a numeric
        # id) would otherwise make str(obj) raise TypeError.
        return str(self.value)
875
class MolSeq(PhyloElement):
    """Store a molecular sequence.

    :Parameters:
        value : string
            the sequence itself
        is_aligned : bool
            True if this sequence is aligned with the others (usually meaning
            all aligned seqs are the same length and gaps may be present)

    """

    re_value = re.compile(r'[a-zA-Z\.\-\?\*_]+')

    def __init__(self, value, is_aligned=None):
        """Check ``value`` against ``re_value`` (warning only) and store it."""
        _check_str(value, self.re_value.match)
        self.is_aligned = is_aligned
        self.value = value

    def __str__(self):
        """Return the stored sequence value."""
        return self.value
900
class Point(PhyloElement):
    """Geographic coordinates of a point, with an optional altitude.

    Used by element 'Distribution'.

    :Parameters:
        geodetic_datum : string, required
            the geodetic datum (also called 'map datum'). For example, Google's
            KML uses 'WGS84'.
        lat : numeric
            latitude
        long : numeric
            longitude
        alt : numeric
            altitude
        alt_unit : string
            unit for the altitude (e.g. 'meter')

    """

    def __init__(self, geodetic_datum, lat, long, alt=None, alt_unit=None):
        """Store the datum, the coordinates and the optional altitude."""
        self.geodetic_datum = geodetic_datum
        # Coordinates
        self.lat = lat
        self.long = long
        # Optional altitude and its unit
        self.alt = alt
        self.alt_unit = alt_unit
929
class Polygon(PhyloElement):
    """A polygon defined by a list of 'Points' (used by element 'Distribution').

    :param points: list of 3 or more points representing vertices.

    """

    def __init__(self, points=None):
        """Store the vertices; a falsy argument becomes an empty list."""
        self.points = points if points else []

    def __str__(self):
        """Return the class name wrapping the string form of each point."""
        joined = ',\n'.join(str(pt) for pt in self.points)
        return '%s([%s])' % (self.__class__.__name__, joined)
945
class Property(PhyloElement):
    """A typed and referenced property from an external resource.

    Can be attached to `Phylogeny`, `Clade`, and `Annotation` objects.

    :Parameters:
        value : string
            the value of the property
        ref : string
            reference to an external resource, e.g. "NOAA:depth"
        applies_to : string
            indicates the item to which a property applies (e.g. 'node' for
            the parent node of a clade, 'parent_branch' for the parent branch
            of a clade, or just 'clade').
        datatype : string
            the type of a property; limited to xsd-datatypes
            (e.g. 'xsd:string', 'xsd:boolean', 'xsd:integer', 'xsd:decimal',
            'xsd:float', 'xsd:double', 'xsd:date', 'xsd:anyURI').
        unit : string (optional)
            the unit of the property, e.g. "METRIC:m"
        id_ref : Id (optional)
            allows attaching a property specifically to one element (on the
            xml-level)

    """

    re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')
    ok_applies_to = set(('phylogeny', 'clade', 'node', 'annotation',
                         'parent_branch', 'other'))
    ok_datatype = set((
        'xsd:string', 'xsd:boolean', 'xsd:decimal', 'xsd:float',
        'xsd:double', 'xsd:duration', 'xsd:dateTime', 'xsd:time',
        'xsd:date', 'xsd:gYearMonth', 'xsd:gYear', 'xsd:gMonthDay',
        'xsd:gDay', 'xsd:gMonth', 'xsd:hexBinary', 'xsd:base64Binary',
        'xsd:anyURI', 'xsd:normalizedString', 'xsd:token', 'xsd:integer',
        'xsd:nonPositiveInteger', 'xsd:negativeInteger', 'xsd:long',
        'xsd:int', 'xsd:short', 'xsd:byte', 'xsd:nonNegativeInteger',
        'xsd:unsignedLong', 'xsd:unsignedInt', 'xsd:unsignedShort',
        'xsd:unsignedByte', 'xsd:positiveInteger',
    ))

    def __init__(self, value, ref, applies_to, datatype,
                 unit=None, id_ref=None):
        """Validate the string fields (warnings only) and store all values."""
        # Same check order as before: ref, applies_to, datatype, unit.
        _check_str(ref, self.re_ref.match)
        _check_str(applies_to, self.ok_applies_to.__contains__)
        _check_str(datatype, self.ok_datatype.__contains__)
        _check_str(unit, self.re_ref.match)
        # Required fields
        self.value = value
        self.ref = ref
        self.applies_to = applies_to
        self.datatype = datatype
        # Optional fields
        self.unit = unit
        self.id_ref = id_ref
999
class ProteinDomain(PhyloElement):
    """Represents an individual domain in a domain architecture.

    The locations use 0-based indexing, as most Python objects including
    SeqFeature do, rather than the usual biological convention starting at 1.
    This means the start and end attributes can be used directly as slice
    indexes on Seq objects.

    :Parameters:
        value : string
            label of the domain; `to_seqfeature` stores it as the id of the
            SeqFeature and `from_seqfeature` reads it back from there
        start : non-negative integer
            start of the domain on the sequence, using 0-based indexing
        end : non-negative integer
            end of the domain on the sequence
        confidence : float
            can be used to store e.g. E-values
        id : string
            unique identifier/name

    """

    def __init__(self, value, start, end, confidence=None, id=None):
        """Initialize value for a ProteinDomain object."""
        self.value = value
        self.start = start
        self.end = end
        self.confidence = confidence
        self.id = id

    @classmethod
    def from_seqfeature(cls, feat):
        """Create a domain from a SeqFeature.

        The feature's id becomes ``value``, its non-fuzzy start/end become
        ``start``/``end`` and the 'confidence' qualifier (if any) becomes
        ``confidence``.
        """
        # Build via cls so subclasses get instances of their own type.
        return cls(feat.id,
                   feat.location.nofuzzy_start,
                   feat.location.nofuzzy_end,
                   confidence=feat.qualifiers.get('confidence'))

    def to_seqfeature(self):
        """Create a SeqFeature from the ProteinDomain Object."""
        feat = SeqFeature(location=FeatureLocation(self.start, self.end),
                          id=self.value)
        if hasattr(self, 'confidence'):
            feat.qualifiers['confidence'] = self.confidence
        return feat
1043
class Reference(PhyloElement):
    """A literature citation attached to a clade.

    Prefer the ``doi`` attribute over the free-text ``desc`` element
    whenever a DOI is available.
    """

    re_doi = re.compile(r'[a-zA-Z0-9_\.]+/[a-zA-Z0-9_\.]+')

    def __init__(self, doi=None, desc=None):
        """Store the DOI and description, warning if the DOI looks wrong.

        ``doi`` is tested against ``re_doi`` through ``_check_str``; a None
        value is not checked, and a mismatch only produces a warning.
        """
        looks_like_doi = self.re_doi.match
        _check_str(doi, looks_like_doi)
        self.doi, self.desc = doi, desc
1059
class Sequence(PhyloElement):
    """A molecular sequence (Protein, DNA, RNA) associated with a node.

    One intended use for ``id_ref`` is to link a sequence to a taxonomy (via the
    taxonomy's ``id_source``) in case of multiple sequences and taxonomies per
    node.

    :Parameters:
        type : {'dna', 'rna', 'protein'}
            type of molecule this sequence represents
        id_ref : string
            reference to another resource
        id_source : string
            source for the reference
        symbol : string
            short symbol of the sequence, e.g. 'ACTM' (max. 10 chars)
        accession : Accession
            accession code for this sequence.
        name : string
            full name of the sequence, e.g. 'muscle Actin'
        location
            location of a sequence on a genome/chromosome.
        mol_seq : MolSeq
            the molecular sequence itself
        uri : Uri
            link
        annotations : list of Annotation objects
            annotations on this sequence
        domain_architecture : DomainArchitecture
            protein domains on this sequence
        other : list of Other objects
            non-phyloXML elements

    """

    # Maps the 'type' attribute to the alphabet used when building a Seq.
    alphabets = {'dna': Alphabet.generic_dna,
                 'rna': Alphabet.generic_rna,
                 'protein': Alphabet.generic_protein}
    re_symbol = re.compile(r'\S{1,10}')

    def __init__(self,
                 # Attributes
                 type=None, id_ref=None, id_source=None,
                 # Child nodes
                 symbol=None, accession=None, name=None, location=None,
                 mol_seq=None, uri=None, domain_architecture=None,
                 # Collections
                 annotations=None, other=None,
                 ):
        """Initialize value for a Sequence object.

        ``type`` is checked against the keys of ``alphabets`` and ``symbol``
        against ``re_symbol``; a failed check only issues a warning.
        ``annotations`` and ``other`` default to fresh empty lists.
        """
        _check_str(type, self.alphabets.__contains__)
        _check_str(symbol, self.re_symbol.match)
        self.type = type
        self.id_ref = id_ref
        self.id_source = id_source
        self.symbol = symbol
        self.accession = accession
        self.name = name
        self.location = location
        self.mol_seq = mol_seq
        self.uri = uri
        self.domain_architecture = domain_architecture
        self.annotations = annotations or []
        self.other = other or []

    @classmethod
    def from_seqrecord(cls, record, is_aligned=None):
        """Create a new PhyloXML Sequence from a SeqRecord object.

        :Parameters:
            record : SeqRecord
                the record to convert; its ``annotations`` dictionary is
                read using the layout that ``to_seqrecord`` produces
            is_aligned : bool (optional)
                whether the sequence is aligned; if None, this is taken
                from whether the record's alphabet is an Alphabet.Gapped

        The instance is built through ``cls`` so that calling this on a
        subclass returns an instance of that subclass.
        """
        if is_aligned is None:
            is_aligned = isinstance(record.seq.alphabet, Alphabet.Gapped)
        params = {
            'accession': Accession(record.id, ''),
            'symbol': record.name,
            'name': record.description,
            'mol_seq': MolSeq(str(record.seq), is_aligned),
        }
        # NOTE(review): these tests look at the record's alphabet object
        # itself; confirm whether an alphabet wrapped in Alphabet.Gapped is
        # meant to leave 'type' unset.
        if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet):
            params['type'] = 'dna'
        elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet):
            params['type'] = 'rna'
        elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet):
            params['type'] = 'protein'

        # Unpack record.annotations
        for key in ('id_ref', 'id_source', 'location'):
            if key in record.annotations:
                params[key] = record.annotations[key]
        if isinstance(record.annotations.get('uri'), dict):
            params['uri'] = Uri(**record.annotations['uri'])
        # Build a Sequence.annotation object
        if record.annotations.get('annotations'):
            params['annotations'] = []
            for annot in record.annotations['annotations']:
                ann_args = {}
                for key in ('ref', 'source', 'evidence', 'type', 'desc'):
                    if key in annot:
                        ann_args[key] = annot[key]
                # Confidence is packed as a [value, type] list
                if isinstance(annot.get('confidence'), list):
                    ann_args['confidence'] = Confidence(
                        *annot['confidence'])
                # Properties are packed as a list of keyword dictionaries;
                # entries that are not dictionaries are ignored
                if isinstance(annot.get('properties'), list):
                    ann_args['properties'] = [Property(**prop)
                                              for prop in annot['properties']
                                              if isinstance(prop, dict)]
                params['annotations'].append(Annotation(**ann_args))

        # Unpack record.features
        if record.features:
            params['domain_architecture'] = DomainArchitecture(
                length=len(record.seq),
                domains=[ProteinDomain.from_seqfeature(feat)
                         for feat in record.features])

        return cls(**params)

    def to_seqrecord(self):
        """Create a SeqRecord object from this Sequence instance.

        The record's id is the string form of ``accession``; when there is
        no accession the id is left for SeqRecord to fill in. ``symbol``
        and ``name`` become the record's name and description, and the
        domains of ``domain_architecture`` become its features.

        The seqrecord.annotations dictionary is packed like so::

            { # Sequence attributes with no SeqRecord equivalent:
              'id_ref':     self.id_ref,
              'id_source':  self.id_source,
              'location':   self.location,
              'uri':        { 'value': self.uri.value,
                              'desc': self.uri.desc,
                              'type': self.uri.type },
              # Sequence.annotations attribute (list of Annotations)
              'annotations': [{'ref': ann.ref,
                               'source': ann.source,
                               'evidence': ann.evidence,
                               'type': ann.type,
                               'confidence': [ann.confidence.value,
                                              ann.confidence.type],
                               'properties': [{'value': prop.value,
                                               'ref': prop.ref,
                                               'applies_to': prop.applies_to,
                                               'datatype': prop.datatype,
                                               'unit': prop.unit,
                                               'id_ref': prop.id_ref}
                                              for prop in ann.properties],
                               } for ann in self.annotations],
            }

        Items whose value is None are left out of every dictionary.
        """
        def clean_dict(dct):
            """Remove None-valued items from a dictionary."""
            return dict((key, val) for key, val in dct.items()
                        if val is not None)

        # str(None) would give the record the literal id 'None'. Passing
        # None instead lets clean_dict drop the key, so SeqRecord falls
        # back to its own default id.
        accession_id = None
        if self.accession is not None:
            accession_id = str(self.accession)

        seqrec = SeqRecord(Seq(self.mol_seq.value, self.get_alphabet()),
                           **clean_dict({
                               'id': accession_id,
                               'name': self.symbol,
                               'description': self.name,
                               # 'dbxrefs' is not populated
                           }))
        if self.domain_architecture:
            seqrec.features = [dom.to_seqfeature()
                               for dom in self.domain_architecture.domains]
        # Sequence attributes with no SeqRecord equivalent
        seqrec.annotations = clean_dict({
            'id_ref': self.id_ref,
            'id_source': self.id_source,
            'location': self.location,
            'uri': self.uri and clean_dict({
                'value': self.uri.value,
                'desc': self.uri.desc,
                'type': self.uri.type,
            }),
            'annotations': self.annotations and [
                clean_dict({
                    'ref': ann.ref,
                    'source': ann.source,
                    'evidence': ann.evidence,
                    'type': ann.type,
                    'confidence': ann.confidence and [
                        ann.confidence.value,
                        ann.confidence.type],
                    'properties': [clean_dict({
                        'value': prop.value,
                        'ref': prop.ref,
                        'applies_to': prop.applies_to,
                        'datatype': prop.datatype,
                        'unit': prop.unit,
                        'id_ref': prop.id_ref})
                        for prop in ann.properties],
                }) for ann in self.annotations],
        })
        return seqrec

    def get_alphabet(self):
        """Return the alphabet matching this sequence's type.

        The alphabet is looked up in ``alphabets`` by ``type``, falling
        back to Alphabet.generic_alphabet for a missing or unlisted type.
        If ``mol_seq`` is set and flagged as aligned, the result is wrapped
        in Alphabet.Gapped.
        """
        alph = self.alphabets.get(self.type, Alphabet.generic_alphabet)
        if self.mol_seq and self.mol_seq.is_aligned:
            return Alphabet.Gapped(alph)
        return alph
1257
class SequenceRelation(PhyloElement):
    """A typed relationship between two sequences.

    This could describe, for instance, an orthology, in which case the
    'type' attribute is 'orthology'.

    :Parameters:
        id_ref_0 : Id
            reference identifier of the first sequence
        id_ref_1 : Id
            reference identifier of the second sequence
        distance : float
            distance between the two sequences
        type : restricted string
            the kind of relationship; expected to be one of ``ok_type``
        confidence : Confidence
            confidence value for this relation

    """

    ok_type = set(['orthology', 'one_to_one_orthology', 'super_orthology',
                   'paralogy', 'ultra_paralogy', 'xenology', 'unknown',
                   'other'])

    def __init__(self, type, id_ref_0, id_ref_1,
                 distance=None, confidence=None):
        """Store the relation, warning if ``type`` is not in ``ok_type``.

        The membership test goes through ``_check_str``, so an unlisted
        type only produces a warning and is stored regardless.
        """
        is_known_type = self.ok_type.__contains__
        _check_str(type, is_known_type)
        self.distance, self.type = distance, type
        self.id_ref_0, self.id_ref_1 = id_ref_0, id_ref_1
        self.confidence = confidence
1290
class Taxonomy(PhyloElement):
    """Taxonomic information for a clade.

    :Parameters:
        id_source : Id
            links other elements to a taxonomy (on the XML level)
        id : Id
            unique identifier of a taxon, e.g. Id('6500',
            provider='ncbi_taxonomy') for the California sea hare
        code : restricted string
            UniProt/Swiss-Prot style organism code, e.g. 'APLCA' for the
            California sea hare 'Aplysia californica'
        scientific_name : string
            the standard scientific name for this organism, e.g. 'Aplysia
            californica' for the California sea hare
        authority : string
            the authority associated with the 'scientific_name', such as
            'J. G. Cooper, 1863'
        common_names : list of strings
            common names for this organism
        synonyms : list of strings
            synonyms for this taxon?
        rank : restricted string
            taxonomic rank; expected to be one of ``ok_rank``
        uri : Uri
            link
        other : list of Other objects
            non-phyloXML elements

    """

    re_code = re.compile(r'[a-zA-Z0-9_]{2,10}')
    ok_rank = set([
        'domain', 'kingdom', 'subkingdom', 'branch', 'infrakingdom',
        'superphylum', 'phylum', 'subphylum', 'infraphylum', 'microphylum',
        'superdivision', 'division', 'subdivision', 'infradivision',
        'superclass', 'class', 'subclass', 'infraclass',
        'superlegion', 'legion', 'sublegion', 'infralegion',
        'supercohort', 'cohort', 'subcohort', 'infracohort',
        'superorder', 'order', 'suborder',
        'superfamily', 'family', 'subfamily',
        'supertribe', 'tribe', 'subtribe', 'infratribe',
        'genus', 'subgenus', 'superspecies', 'species', 'subspecies',
        'variety', 'subvariety', 'form', 'subform', 'cultivar',
        'unknown', 'other',
    ])

    def __init__(self,
                 # Attributes
                 id_source=None,
                 # Child nodes
                 id=None, code=None, scientific_name=None, authority=None,
                 rank=None, uri=None,
                 # Collections
                 common_names=None, synonyms=None, other=None,
                 ):
        """Store the taxonomy fields, warning on a bad code or rank.

        ``code`` is tested against ``re_code`` and ``rank`` against
        ``ok_rank``, both through ``_check_str``, so a failed check only
        produces a warning. The three collections default to fresh empty
        lists.
        """
        _check_str(code, self.re_code.match)
        _check_str(rank, self.ok_rank.__contains__)
        self.id_source, self.id = id_source, id
        self.code, self.scientific_name = code, scientific_name
        self.authority, self.rank = authority, rank
        self.uri = uri
        # A missing or empty collection is replaced by a new list.
        self.common_names = common_names if common_names else []
        self.synonyms = synonyms if synonyms else []
        self.other = other if other else []

    def __str__(self):
        """Return the most specific label available for this taxonomy.

        The first of code, scientific name and rank that is not None is
        returned as is; failing those, the string form of the id; and if
        nothing is set, the class name.
        """
        for label in (self.code, self.scientific_name, self.rank):
            if label is not None:
                return label
        if self.id is None:
            return self.__class__.__name__
        return str(self.id)
1368
class Uri(PhyloElement):
    """A uniform resource identifier.

    This is generally expected to be a URL, for example a link to an image
    on a website, in which case the ``type`` attribute might be 'image' and
    ``desc`` might be 'image of a California sea hare'.
    """

    def __init__(self, value, desc=None, type=None):
        """Store the identifier with its optional description and type."""
        self.value, self.desc, self.type = value, desc, type

    def __str__(self):
        """Return the identifier itself, or the repr if it is empty."""
        text = self.value
        if not text:
            return repr(self)
        return text
1387