Package Bio :: Package Phylo :: Module PhyloXML
[hide private]
[frames | no frames]

Source Code for Module Bio.Phylo.PhyloXML

   1  # Copyright (C) 2009 by Eric Talevich (eric.talevich@gmail.com) 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Classes corresponding to phyloXML elements. 
   7   
   8  See Also 
   9  -------- 
  10  Official specification: 
  11     http://phyloxml.org/ 
  12  Journal article: 
  13      Han and Zmasek (2009), doi:10.1186/1471-2105-10-356 
  14  """ 
  15   
  16  import re 
  17  import warnings 
  18   
  19  from Bio._py3k import basestring 
  20   
  21  from Bio import Alphabet 
  22  from Bio.Align import MultipleSeqAlignment 
  23  from Bio.Seq import Seq 
  24  from Bio.SeqFeature import SeqFeature, FeatureLocation 
  25  from Bio.SeqRecord import SeqRecord 
  26  from Bio import BiopythonWarning 
  27   
  28  from Bio.Phylo import BaseTree 
class PhyloXMLWarning(BiopythonWarning):
    """Issued when data does not comply with the phyloXML specification."""
34
35 36 -def _check_str(text, testfunc):
37 """Check a string using testfunc, and warn if there's no match.""" 38 if text is not None and not testfunc(text): 39 warnings.warn("String %s doesn't match the given regexp" % text, 40 PhyloXMLWarning, stacklevel=2)
41
# Core elements

class PhyloElement(BaseTree.TreeElement):
    """Common ancestor of every class that models a phyloXML element."""
47
class Phyloxml(PhyloElement):
    """Root node of the PhyloXML document.

    Holds any number of Phylogeny elements, optionally followed by elements
    that belong to other XML namespaces.

    :Parameters:
        attributes : dict
            (XML namespace definitions)
        phylogenies : list
            The phylogenetic trees
        other : list
            Arbitrary non-phyloXML elements, if any
    """

    def __init__(self, attributes, phylogenies=None, other=None):
        """Store the namespace attributes and the contained trees.

        The standard phyloXML namespace definitions are always present;
        entries in ``attributes`` are layered on top of them.
        """
        namespaces = {
            # standard
            "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
            "xmlns": "http://www.phyloxml.org",
            "xsi:schemaLocation": "http://www.phyloxml.org http://www.phyloxml.org/1.10/phyloxml.xsd",
        }
        if attributes:
            namespaces.update(attributes)
        self.attributes = namespaces
        self.phylogenies = phylogenies or []
        self.other = other or []

    def __getitem__(self, index):
        """Get a phylogeny by index or name.

        Integers and slices index the list of phylogenies directly; a string
        selects the first phylogeny whose ``name`` equals it. Anything else,
        or a name that is not found, raises KeyError.
        """
        if isinstance(index, (int, slice)):
            return self.phylogenies[index]
        if not isinstance(index, basestring):
            raise KeyError("can't use %s as an index" % type(index))
        for candidate in self.phylogenies:
            if candidate.name == index:
                return candidate
        raise KeyError("no phylogeny found with name " + repr(index))

    def __iter__(self):
        """Iterate through the phylogenetic trees in this object."""
        return iter(self.phylogenies)

    def __len__(self):
        """Number of phylogenetic trees in this object."""
        return len(self.phylogenies)

    def __str__(self):
        """Show the class name and the string form of each phylogeny."""
        listing = ',\n'.join(str(tree) for tree in self.phylogenies)
        return '%s([%s])' % (self.__class__.__name__, listing)
99
class Other(PhyloElement):
    """Container for non-phyloXML elements in the tree.

    An Other object normally carries either a 'value' or a non-empty list of
    'children', not both. This class does not enforce that.

    :Parameters:
        tag : string
            local tag for the XML node
        namespace : string
            XML namespace for the node -- should not be the default phyloXML
            namespace.
        attributes : dict of strings
            attributes on the XML node
        value : string
            text contained directly within this XML node
        children : list
            child nodes, if any (also `Other` instances)
    """

    def __init__(self, tag, namespace=None, attributes=None, value=None,
                 children=None):
        """Store the node description; missing collections become empty."""
        self.tag = tag
        self.namespace = namespace
        self.attributes = attributes if attributes else {}
        self.value = value
        self.children = children if children else []

    def __iter__(self):
        """Iterate through the children of this object (if any)."""
        return iter(self.children)
132
class Phylogeny(PhyloElement, BaseTree.Tree):
    """A phylogenetic tree.

    :Parameters:
        root : Clade
            the root node/clade of this tree
        rooted : bool
            True if this tree is rooted
        rerootable : bool
            True if this tree is rerootable
        branch_length_unit : string
            unit for branch_length values on clades
        name : string
            identifier for this tree, not required to be unique
        id : Id
            unique identifier for this tree
        description : string
            plain-text description
        date : Date
            date for the root node of this tree
        confidences : list
            Confidence objects for this tree
        clade_relations : list
            CladeRelation objects
        sequence_relations : list
            SequenceRelation objects
        properties : list
            Property objects
        other : list
            non-phyloXML elements (type `Other`)
    """

    def __init__(self, root=None, rooted=True,
                 rerootable=None, branch_length_unit=None, type=None,
                 # Child nodes
                 name=None, id=None, description=None, date=None,
                 # Collections
                 confidences=None, clade_relations=None,
                 sequence_relations=None, properties=None, other=None,
                 ):
        """Store the tree attributes; missing collections become empty lists."""
        # Kept as an assertion so callers that rely on AssertionError for a
        # non-bool ``rooted`` keep working.
        assert isinstance(rooted, bool)
        self.root = root
        self.rooted = rooted
        self.rerootable = rerootable
        self.branch_length_unit = branch_length_unit
        self.type = type
        self.name = name
        self.id = id
        self.description = description
        self.date = date
        self.confidences = confidences or []
        self.clade_relations = clade_relations or []
        self.sequence_relations = sequence_relations or []
        self.properties = properties or []
        self.other = other or []

    @classmethod
    def from_tree(cls, tree, **kwargs):
        """Create a new Phylogeny given a Tree (from Newick/Nexus or BaseTree).

        Keyword arguments are the usual `Phylogeny` constructor parameters.
        They are written directly into the new object's ``__dict__`` after
        construction.
        """
        # Conditional expression instead of ``cond and a or b``, which would
        # silently yield None if the Id instance ever evaluated as false.
        tree_id = Id(str(tree.id)) if tree.id is not None else None
        phy = cls(
            root=Clade.from_clade(tree.root),
            rooted=tree.rooted,
            name=tree.name,
            id=tree_id)
        phy.__dict__.update(kwargs)
        return phy

    @classmethod
    def from_clade(cls, clade, **kwargs):
        """Create a new Phylogeny given a Newick or BaseTree Clade object.

        Keyword arguments are the usual `PhyloXML.Clade` constructor parameters.
        """
        return Clade.from_clade(clade).to_phylogeny(**kwargs)

    def as_phyloxml(self):
        """Return this tree, a PhyloXML-compatible Phylogeny object.

        Overrides the `BaseTree` method.
        """
        return self

    def to_phyloxml_container(self, **kwargs):
        """Create a new Phyloxml object containing just this phylogeny.

        Keyword arguments become the container's ``attributes``.
        """
        return Phyloxml(kwargs, phylogenies=[self])

    def to_alignment(self):
        """Construct an alignment from the aligned sequences in this tree.

        Returns an empty MultipleSeqAlignment when no aligned sequence is
        found. Sequence elements without a ``mol_seq`` are skipped.
        """
        def is_aligned_seq(elem):
            # ``mol_seq`` defaults to None on Sequence, so guard before
            # reading ``is_aligned`` to avoid an AttributeError.
            return bool(isinstance(elem, Sequence)
                        and elem.mol_seq is not None
                        and elem.mol_seq.is_aligned)

        seqs = self._filter_search(is_aligned_seq, 'preorder', True)
        try:
            first_seq = next(seqs)
        except StopIteration:
            # No aligned sequences were found --> empty MSA
            return MultipleSeqAlignment([])
        msa = MultipleSeqAlignment([first_seq.to_seqrecord()],
                                   first_seq.get_alphabet())
        msa.extend(seq.to_seqrecord() for seq in seqs)
        return msa

    # Singular property for plural attribute
    def _get_confidence(self):
        """Equivalent to self.confidences[0] if there is only 1 value.

        Returns None when there are no confidences and raises AttributeError
        when there is more than one.

        See also: `Clade.confidence`, `Clade.taxonomy`
        """
        if len(self.confidences) == 0:
            return None
        if len(self.confidences) > 1:
            raise AttributeError("more than 1 confidence value available; "
                                 "use Phylogeny.confidences")
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the single confidence value.

        None clears the list; a float or int is wrapped in a Confidence.
        Raises ValueError for any other type, or when several confidences
        already exist.
        """
        if value is None:
            # Special case: mirror the behavior of _get_confidence
            self.confidences = []
            return
        if isinstance(value, (float, int)):
            value = Confidence(value)
        elif not isinstance(value, Confidence):
            raise ValueError("value must be a number or Confidence instance")
        if len(self.confidences) == 0:
            self.confidences.append(value)
        elif len(self.confidences) == 1:
            self.confidences[0] = value
        else:
            raise ValueError("multiple confidence values already exist; "
                             "use Phylogeny.confidences instead")

    def _del_confidence(self):
        """Remove all confidence values."""
        self.confidences = []

    confidence = property(_get_confidence, _set_confidence, _del_confidence)
class Clade(PhyloElement, BaseTree.Clade):
    """Describes a branch of the current phylogenetic tree.

    Used recursively, describes the topology of a phylogenetic tree.

    Both ``color`` and ``width`` elements should be interpreted by client code
    as applying to the whole clade, including all descendants, unless
    overwritten in sub-clades. This module doesn't automatically assign these
    attributes to sub-clades to achieve this cascade -- and neither should you.

    :Parameters:
        branch_length
            parent branch length of this clade
        id_source
            link other elements to a clade (on the xml-level)
        name : string
            short label for this clade
        confidences : list of Confidence objects
            used to indicate the support for a clade/parent branch.
        width : float
            branch width for this clade (including branch from parent)
        color : BranchColor
            color used for graphical display of this clade
        node_id
            unique identifier for the root node of this clade
        taxonomies : list
            Taxonomy objects
        sequences : list
            Sequence objects
        events : Events
            describe such events as gene-duplications at the root node/parent
            branch of this clade
        binary_characters : BinaryCharacters
            binary characters
        distributions : list of Distribution objects
            distribution(s) of this clade
        date : Date
            a date for the root node of this clade
        references : list
            Reference objects
        properties : list
            Property objects
        clades : list Clade objects
            Sub-clades
        other : list of Other objects
            non-phyloXML objects
    """

    def __init__(self,
                 # Attributes
                 branch_length=None, id_source=None,
                 # Child nodes
                 name=None, width=None, color=None, node_id=None, events=None,
                 binary_characters=None, date=None,
                 # Collections
                 confidences=None, taxonomies=None, sequences=None,
                 distributions=None, references=None, properties=None,
                 clades=None, other=None,
                 ):
        """Store the clade attributes; missing collections become empty lists."""
        self.branch_length = branch_length
        self.id_source = id_source
        self.name = name
        self.width = width
        self.color = color
        self.node_id = node_id
        self.events = events
        self.binary_characters = binary_characters
        self.date = date
        self.confidences = confidences or []
        self.taxonomies = taxonomies or []
        self.sequences = sequences or []
        self.distributions = distributions or []
        self.references = references or []
        self.properties = properties or []
        self.clades = clades or []
        self.other = other or []

    @classmethod
    def from_clade(cls, clade, **kwargs):
        """Create a new PhyloXML Clade from a Newick or BaseTree Clade object.

        Sub-clades are converted recursively. Keyword arguments are the usual
        PhyloXML Clade constructor parameters; they are written directly into
        the new object's ``__dict__`` after construction.
        """
        new_clade = cls(branch_length=clade.branch_length,
                        name=clade.name)
        new_clade.clades = [cls.from_clade(c) for c in clade]
        new_clade.confidence = clade.confidence
        new_clade.width = clade.width
        new_clade.color = (BranchColor(
            clade.color.red, clade.color.green, clade.color.blue)
            if clade.color else None)
        new_clade.__dict__.update(kwargs)
        return new_clade

    def to_phylogeny(self, **kwargs):
        """Create a new phylogeny containing just this clade.

        The phylogeny takes this clade as its root and shares its ``date``.
        Keyword arguments are written into the phylogeny's ``__dict__``.
        """
        phy = Phylogeny(root=self, date=self.date)
        phy.__dict__.update(kwargs)
        return phy

    # Shortcuts for list attributes that are usually only 1 item
    # NB: Duplicated from Phylogeny class
    def _get_confidence(self):
        """Return the only confidence, or None if there is none.

        Raises AttributeError when more than one confidence exists.
        """
        if len(self.confidences) == 0:
            return None
        if len(self.confidences) > 1:
            raise AttributeError("more than 1 confidence value available; "
                                 "use Clade.confidences")
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the single confidence value.

        None clears the list; a float or int is wrapped in a Confidence.
        Raises ValueError for any other type, or when several confidences
        already exist.
        """
        if value is None:
            # Special case: mirror the behavior of _get_confidence
            self.confidences = []
            return
        if isinstance(value, (float, int)):
            value = Confidence(value)
        elif not isinstance(value, Confidence):
            raise ValueError("value must be a number or Confidence instance")
        if len(self.confidences) == 0:
            self.confidences.append(value)
        elif len(self.confidences) == 1:
            self.confidences[0] = value
        else:
            # The message names this class; it previously pointed users at
            # Phylogeny.confidences, which is a different object.
            raise ValueError("multiple confidence values already exist; "
                             "use Clade.confidences instead")

    def _del_confidence(self):
        """Remove all confidence values."""
        self.confidences = []

    confidence = property(_get_confidence, _set_confidence, _del_confidence)

    def _get_taxonomy(self):
        """Return the only taxonomy, or None if there is none.

        Raises AttributeError when more than one taxonomy exists.
        """
        if len(self.taxonomies) == 0:
            return None
        if len(self.taxonomies) > 1:
            raise AttributeError("more than 1 taxonomy value available; "
                                 "use Clade.taxonomies")
        return self.taxonomies[0]

    def _set_taxonomy(self, value):
        """Set the single taxonomy value.

        Raises ValueError if ``value`` is not a Taxonomy instance, or when
        several taxonomies already exist.
        """
        if not isinstance(value, Taxonomy):
            raise ValueError("assigned value must be a Taxonomy instance")
        if len(self.taxonomies) == 0:
            self.taxonomies.append(value)
        elif len(self.taxonomies) == 1:
            self.taxonomies[0] = value
        else:
            # The message names this class; Phylogeny has no ``taxonomies``.
            raise ValueError("multiple taxonomy values already exist; "
                             "use Clade.taxonomies instead")

    taxonomy = property(_get_taxonomy, _set_taxonomy)
428
# PhyloXML wrapper for a special BaseTree attribute

class BranchColor(PhyloElement, BaseTree.BranchColor):
    """PhyloXML element wrapping `BaseTree.BranchColor`."""

    def __init__(self, *args, **kwargs):
        """Pass every argument straight to `BaseTree.BranchColor`."""
        BaseTree.BranchColor.__init__(self, *args, **kwargs)
436
# PhyloXML-specific complex types

class Accession(PhyloElement):
    """Captures the local part in a sequence identifier.

    Example: for ``UniProtKB:P17304`` the ``value`` attribute of the
    Accession instance is 'P17304' and its ``source`` attribute is
    'UniProtKB'.
    """

    def __init__(self, value, source):
        """Store the accession value and the source it comes from."""
        self.value = value
        self.source = source

    def __str__(self):
        """Format the accession as ``source:value``."""
        parts = (self.source, self.value)
        return '%s:%s' % parts
454
class Annotation(PhyloElement):
    """The annotation of a molecular sequence.

    Annotating through the optional 'ref' attribute is recommended.

    :Parameters:
        ref : string
            reference string, e.g. 'GO:0008270',
            'KEGG:Tetrachloroethene degradation', 'EC:1.1.1.1'
        source : string
            plain-text source for this annotation
        evidence : str
            describe evidence as free text (e.g. 'experimental')
        desc : string
            free text description
        confidence : Confidence
            state the type and value of support (type Confidence)
        properties : list
            typed and referenced annotations from external resources
        uri : Uri
            link
    """

    # Expected shape of ``ref``: "<prefix>:<identifier>"
    re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')

    def __init__(self,
                 # Attributes
                 ref=None, source=None, evidence=None, type=None,
                 # Child nodes
                 desc=None, confidence=None, uri=None,
                 # Collection
                 properties=None):
        """Store the annotation fields, warning if ``ref`` looks malformed."""
        _check_str(ref, self.re_ref.match)
        self.ref = ref
        self.source = source
        self.evidence = evidence
        self.type = type
        self.desc = desc
        self.confidence = confidence
        self.uri = uri
        self.properties = properties if properties else []
496
class BinaryCharacters(PhyloElement):
    """Binary characters at the root of a clade.

    Records the names and/or counts of the binary characters that are
    present, gained, and lost at the root of a clade.
    """

    def __init__(self,
                 # Attributes
                 type=None, gained_count=None, lost_count=None,
                 present_count=None, absent_count=None,
                 # Child nodes (flattened into collections)
                 gained=None, lost=None, present=None, absent=None):
        """Store the counts as given; missing name lists become empty."""
        self.type = type
        self.gained_count = gained_count
        self.lost_count = lost_count
        self.present_count = present_count
        self.absent_count = absent_count
        self.gained = gained if gained else []
        self.lost = lost if lost else []
        self.present = present if present else []
        self.absent = absent if absent else []
520
class CladeRelation(PhyloElement):
    """Expresses a typed relationship between two clades.

    One possible use is describing multiple parents of a clade.

    :Parameters:
        type : str
            the kind of relationship
        id_ref_0 : str
            reference to the first clade
        id_ref_1 : str
            reference to the second clade
        distance : str
            distance between the two clades
        confidence : Confidence
            support for this relationship
    """

    def __init__(self, type, id_ref_0, id_ref_1,
                 distance=None, confidence=None):
        """Store the relation's type, endpoints and optional details."""
        self.distance = distance
        self.type = type
        self.id_ref_0 = id_ref_0
        self.id_ref_1 = id_ref_1
        self.confidence = confidence
542
class Confidence(PhyloElement):
    """A general purpose confidence element.

    For example, this can be used to express the bootstrap support value of a
    clade (in which case the `type` attribute is 'bootstrap').

    Comparison and arithmetic operators act on ``value``, so an instance can
    be used much like a plain number. Arithmetic returns plain numbers, not
    Confidence objects.

    :Parameters:
        value : float
            confidence value
        type : string
            label for the type of confidence, e.g. 'bootstrap'
    """

    def __init__(self, value, type='unknown'):
        """Store the numeric value and its type label."""
        self.value = value
        self.type = type

    # Comparison operators

    def __hash__(self):
        """Return an identity-based hash of the object.

        NOTE: the hash is ``id(self)``, so two Confidence objects that
        compare equal by value do not share a hash. This is kept as-is
        because ``value`` is mutable.
        """
        return id(self)

    def __eq__(self, other):
        """Compare by value against a Confidence or a plain number."""
        if isinstance(other, Confidence):
            return self.value == other.value
        return self.value == other

    def __ne__(self, other):
        """Inverse of value-based equality."""
        if isinstance(other, Confidence):
            return self.value != other.value
        return self.value != other

    # Ordering -- see functools.total_ordering in Py2.7

    def __lt__(self, other):
        """Order by value against a Confidence or a plain number."""
        if isinstance(other, Confidence):
            return self.value < other.value
        return self.value < other

    def __le__(self, other):
        return self < other or self == other

    def __gt__(self, other):
        return not (self <= other)

    def __ge__(self, other):
        # Goes through __lt__ so Confidence operands are unwrapped the same
        # way as in the other ordering methods.
        return not (self < other)

    # Arithmetic operators, including reverse

    def __add__(self, other):
        return self.value + other

    def __radd__(self, other):
        return other + self.value

    def __sub__(self, other):
        return self.value - other

    def __rsub__(self, other):
        return other - self.value

    def __mul__(self, other):
        return self.value * other

    def __rmul__(self, other):
        return other * self.value

    def __div__(self, other):
        """Classic division; only invoked implicitly on Python 2.

        Uses the ``/`` operator instead of calling ``value.__div__``
        directly: the direct call returns NotImplemented for mixed operand
        types (e.g. int value, float other) rather than coercing them.
        """
        return self.value / other

    def __rdiv__(self, other):
        """Reflected classic division; only invoked implicitly on Python 2."""
        return other / self.value

    def __truediv__(self, other):
        """Rational-style division in Py3.0+.

        Also active in Py2.5+ with __future__.division import.
        """
        return self.value / other

    def __rtruediv__(self, other):
        return other / self.value

    def __floordiv__(self, other):
        """C-style and old-style division in Py3.0+.

        Also active in Py2.5+ with __future__.division import.

        Uses the ``//`` operator so mixed int/float operands are coerced;
        calling ``value.__floordiv__`` directly returns NotImplemented there.
        """
        return self.value // other

    def __rfloordiv__(self, other):
        """Reflected floor division, with the same operand coercion."""
        return other // self.value

    def __mod__(self, other):
        return self.value % other

    def __rmod__(self, other):
        return other % self.value

    def __divmod__(self, other):
        return divmod(self.value, other)

    def __rdivmod__(self, other):
        return divmod(other, self.value)

    def __pow__(self, other, modulo=None):
        if modulo is not None:
            return pow(self.value, other, modulo)
        return pow(self.value, other)

    def __rpow__(self, other):
        return pow(other, self.value)

    # Unary arithmetic operations: -, +, abs()

    def __neg__(self):
        return -self.value

    def __pos__(self):
        return self.value

    def __abs__(self):
        return abs(self.value)

    # Explicit coercion to numeric types: int, long, float

    def __float__(self):
        return float(self.value)

    def __int__(self):
        return int(self.value)

    def __long__(self):
        # Python 2 only: ``long`` does not exist on Python 3, where this
        # method is never called implicitly.
        return long(self.value)
686
class Date(PhyloElement):
    """A date associated with a clade/node.

    The date may be numerical (the 'value' element), free text (the 'desc'
    element, e.g. 'Silurian'), or both. When a numerical value is given,
    setting the 'unit' attribute as well is recommended.

    :Parameters:
        unit : string
            type of numerical value (e.g. 'mya' for 'million years ago')
        value : float
            the date value
        desc : string
            plain-text description of the date
        minimum : float
            lower bound on the date value
        maximum : float
            upper bound on the date value
    """

    def __init__(self, value=None, unit=None, desc=None,
                 minimum=None, maximum=None):
        """Store the date value, unit, description and bounds."""
        self.value = value
        self.unit = unit
        self.desc = desc
        self.minimum = minimum
        self.maximum = maximum

    def __str__(self):
        """Show the human-readable date, falling back to the class name.

        Prefers "<value> <unit>" when a unit and a value are both set, then
        the free-text description, then the class name.
        """
        has_numeric_form = self.unit and self.value is not None
        if has_numeric_form:
            return '%s %s' % (self.value, self.unit)
        if self.desc is None:
            return self.__class__.__name__
        return self.desc
723
class Distribution(PhyloElement):
    """Geographic distribution of the items of a clade (species, sequences).

    Meant for phylogeographic applications.

    :Parameters:
        desc : string
            free-text description of the location
        points : list of `Point` objects
            coordinates (similar to the 'Point' element in Google's KML format)
        polygons : list of `Polygon` objects
            coordinate sets defining geographic regions
    """

    def __init__(self, desc=None, points=None, polygons=None):
        """Store the description; missing point/polygon lists become empty."""
        self.desc = desc
        self.points = points if points else []
        self.polygons = polygons if polygons else []
743
class DomainArchitecture(PhyloElement):
    """Domain architecture of a protein.

    :Parameters:
        length : int
            total length of the protein sequence
        domains : list ProteinDomain objects
            the domains within this protein
    """

    def __init__(self, length=None, domains=None):
        """Store the protein length and its domains.

        A missing ``domains`` argument becomes an empty list, as for every
        other collection attribute in this module, so the attribute can
        always be iterated.
        """
        self.length = length
        self.domains = domains or []
758
class Events(PhyloElement):
    """Events at the root node of a clade (e.g. one gene duplication).

    Every attribute defaults to None. The object also offers a dictionary
    interface in which an attribute set to None counts as a missing key, and
    deleting a key resets the attribute to None.
    """

    ok_type = set(['transfer', 'fusion', 'speciation_or_duplication', 'other',
                   'mixed', 'unassigned'])

    def __init__(self, type=None, duplications=None, speciations=None,
                 losses=None, confidence=None):
        """Store the event data, warning if ``type`` is not in ``ok_type``."""
        _check_str(type, self.ok_type.__contains__)
        self.type = type
        self.duplications = duplications
        self.speciations = speciations
        self.losses = losses
        self.confidence = confidence

    def items(self):
        """List (name, value) pairs for the attributes that are not None."""
        return [(name, val) for name, val in self.__dict__.items()
                if val is not None]

    def keys(self):
        """List the names of the attributes that are not None."""
        return [name for name, _ in self.items()]

    def values(self):
        """List the attribute values that are not None."""
        return [val for _, val in self.items()]

    def __len__(self):
        """Count the attributes that are not None."""
        # TODO - Better way to do this?
        return len(self.values())

    def __getitem__(self, key):
        """Return the attribute named ``key``.

        Raises KeyError if there is no such attribute or its value is None.
        """
        if not hasattr(self, key):
            raise KeyError(key)
        found = getattr(self, key)
        if found is not None:
            return found
        raise KeyError("%s has not been set in this object" % repr(key))

    def __setitem__(self, key, val):
        """Assign ``val`` to the attribute named ``key``."""
        setattr(self, key, val)

    def __delitem__(self, key):
        """Reset the attribute named ``key`` to None."""
        setattr(self, key, None)

    def __iter__(self):
        """Iterate over the names of the attributes that are not None."""
        return iter(self.keys())

    def __contains__(self, key):
        """True if the attribute named ``key`` exists and is not None."""
        return (hasattr(self, key) and getattr(self, key) is not None)
811
class Id(PhyloElement):
    """A general-purpose identifier element.

    Pairs the identifier value with an optional provider (or authority)
    that issued it, e.g. NCBI.
    """

    def __init__(self, value, provider=None):
        """Store the identifier and, optionally, who provides it."""
        self.value = value
        self.provider = provider

    def __str__(self):
        """Format as ``provider:value``, or just the value without a provider."""
        if self.provider is None:
            return self.value
        return '%s:%s' % (self.provider, self.value)
828
class MolSeq(PhyloElement):
    """Store a molecular sequence.

    :Parameters:
        value : string
            the sequence itself
        is_aligned : bool
            True if this sequence is aligned with the others (usually meaning
            all aligned seqs are the same length and gaps may be present)
    """

    # Characters accepted at the start of a sequence string
    re_value = re.compile(r'[a-zA-Z\.\-\?\*_]+')

    def __init__(self, value, is_aligned=None):
        """Store the sequence, warning if it does not match ``re_value``."""
        _check_str(value, self.re_value.match)
        self.value = value
        self.is_aligned = is_aligned

    def __str__(self):
        """Return the sequence string itself."""
        return self.value
849
class Point(PhyloElement):
    """Geographic coordinates of a point, with an optional altitude.

    Used by element 'Distribution'.

    :Parameters:
        geodetic_datum : string, required
            the geodetic datum (also called 'map datum'). For example, Google's
            KML uses 'WGS84'.
        lat : numeric
            latitude
        long : numeric
            longitude
        alt : numeric
            altitude
        alt_unit : string
            unit for the altitude (e.g. 'meter')
    """

    def __init__(self, geodetic_datum, lat, long, alt=None, alt_unit=None):
        """Store the datum, the coordinates and the optional altitude."""
        self.geodetic_datum = geodetic_datum
        self.lat = lat
        self.long = long
        self.alt = alt
        self.alt_unit = alt_unit
876
class Polygon(PhyloElement):
    """A polygon defined by a list of 'Points' (used by element 'Distribution').

    :param points: list of 3 or more points representing vertices.
    """

    def __init__(self, points=None):
        """Store the vertices; a missing list becomes empty."""
        self.points = points if points else []

    def __str__(self):
        """Show the class name and the string form of each vertex."""
        vertices = ',\n'.join(str(point) for point in self.points)
        return '%s([%s])' % (self.__class__.__name__, vertices)
890
class Property(PhyloElement):
    """A typed and referenced property from an external resource.

    Can be attached to `Phylogeny`, `Clade`, and `Annotation` objects.

    :Parameters:
        value : string
            the value of the property
        ref : string
            reference to an external resource, e.g. "NOAA:depth"
        applies_to : string
            indicates the item to which a property applies (e.g. 'node' for
            the parent node of a clade, 'parent_branch' for the parent branch
            of a clade, or just 'clade').
        datatype : string
            the type of a property; limited to xsd-datatypes
            (e.g. 'xsd:string', 'xsd:boolean', 'xsd:integer', 'xsd:decimal',
            'xsd:float', 'xsd:double', 'xsd:date', 'xsd:anyURI').
        unit : string (optional)
            the unit of the property, e.g. "METRIC:m"
        id_ref : Id (optional)
            allows attaching a property specifically to one element (on the
            xml-level)
    """

    # Expected shape of ``ref`` and ``unit``: "<prefix>:<identifier>"
    re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')
    ok_applies_to = set(['phylogeny', 'clade', 'node', 'annotation',
                         'parent_branch', 'other'])
    ok_datatype = set([
        'xsd:string', 'xsd:boolean', 'xsd:decimal', 'xsd:float',
        'xsd:double', 'xsd:duration', 'xsd:dateTime', 'xsd:time', 'xsd:date',
        'xsd:gYearMonth', 'xsd:gYear', 'xsd:gMonthDay', 'xsd:gDay',
        'xsd:gMonth', 'xsd:hexBinary', 'xsd:base64Binary', 'xsd:anyURI',
        'xsd:normalizedString', 'xsd:token', 'xsd:integer',
        'xsd:nonPositiveInteger', 'xsd:negativeInteger', 'xsd:long', 'xsd:int',
        'xsd:short', 'xsd:byte', 'xsd:nonNegativeInteger', 'xsd:unsignedLong',
        'xsd:unsignedInt', 'xsd:unsignedShort', 'xsd:unsignedByte',
        'xsd:positiveInteger',
    ])

    def __init__(self, value, ref, applies_to, datatype,
                 unit=None, id_ref=None):
        """Store the property, warning about any field that looks invalid."""
        # Checks only emit warnings; the values are stored regardless.
        _check_str(ref, self.re_ref.match)
        _check_str(applies_to, self.ok_applies_to.__contains__)
        _check_str(datatype, self.ok_datatype.__contains__)
        _check_str(unit, self.re_ref.match)
        self.unit = unit
        self.id_ref = id_ref
        self.value = value
        self.ref = ref
        self.applies_to = applies_to
        self.datatype = datatype
941
class ProteinDomain(PhyloElement):
    """Represents an individual domain in a domain architecture.

    The locations use 0-based indexing, as most Python objects including
    SeqFeature do, rather than the usual biological convention starting at 1.
    This means the start and end attributes can be used directly as slice
    indexes on Seq objects.

    :Parameters:
        value : string
            content of the domain element; stored as the SeqFeature ``id``
            by `to_seqfeature`
        start : non-negative integer
            start of the domain on the sequence, using 0-based indexing
        end : non-negative integer
            end of the domain on the sequence
        confidence : float
            can be used to store e.g. E-values
        id : string
            unique identifier/name
    """

    def __init__(self, value, start, end, confidence=None, id=None):
        """Store the domain's value, location, confidence and identifier."""
        self.value = value
        self.start = start
        self.end = end
        self.confidence = confidence
        self.id = id

    @classmethod
    def from_seqfeature(cls, feat):
        """Create a domain from a SeqFeature.

        Uses the feature's ``id`` as the value, its non-fuzzy start and end
        as the location, and the 'confidence' qualifier if present.
        """
        # ``cls`` rather than a hard-coded class name, so subclasses get
        # instances of their own type from this alternate constructor.
        return cls(feat.id,
                   feat.location.nofuzzy_start,
                   feat.location.nofuzzy_end,
                   confidence=feat.qualifiers.get('confidence'))

    def to_seqfeature(self):
        """Create a SeqFeature from this domain.

        The feature's ``id`` is this domain's ``value``; the confidence is
        stored under the 'confidence' qualifier.
        """
        feat = SeqFeature(location=FeatureLocation(self.start, self.end),
                          id=self.value)
        if hasattr(self, 'confidence'):
            feat.qualifiers['confidence'] = self.confidence
        return feat
982
class Reference(PhyloElement):
    """Literature reference for a clade.

    NB: Prefer the ``doi`` attribute over the free-text ``desc`` element
    whenever possible.
    """

    # Expected shape of ``doi``: "<prefix>/<suffix>"
    re_doi = re.compile(r'[a-zA-Z0-9_\.]+/[a-zA-Z0-9_\.]+')

    def __init__(self, doi=None, desc=None):
        """Store the reference, warning if ``doi`` does not match ``re_doi``."""
        _check_str(doi, self.re_doi.match)
        self.doi = doi
        self.desc = desc
996
class Sequence(PhyloElement):
    """A molecular sequence (Protein, DNA, RNA) associated with a node.

    One intended use for ``id_ref`` is to link a sequence to a taxonomy (via the
    taxonomy's ``id_source``) in case of multiple sequences and taxonomies per
    node.

    :Parameters:
        type : {'dna', 'rna', 'protein'}
            type of molecule this sequence represents
        id_ref : string
            reference to another resource
        id_source : string
            source for the reference
        symbol : string
            short symbol of the sequence, e.g. 'ACTM' (max. 10 chars)
        accession : Accession
            accession code for this sequence.
        name : string
            full name of the sequence, e.g. 'muscle Actin'
        location
            location of a sequence on a genome/chromosome.
        mol_seq : MolSeq
            the molecular sequence itself
        uri : Uri
            link
        annotations : list of Annotation objects
            annotations on this sequence
        domain_architecture : DomainArchitecture
            protein domains on this sequence
        other : list of Other objects
            non-phyloXML elements
    """

    # Maps the phyloXML 'type' attribute to the matching Biopython alphabet.
    alphabets = {'dna': Alphabet.generic_dna,
                 'rna': Alphabet.generic_rna,
                 'protein': Alphabet.generic_protein}
    # The pattern is used with ``.match``, which only anchors at the start of
    # the string; the trailing ``\Z`` makes the whole symbol subject to the
    # 1-10 non-whitespace character limit instead of just its prefix.
    re_symbol = re.compile(r'\S{1,10}\Z')

    def __init__(self,
                 # Attributes
                 type=None, id_ref=None, id_source=None,
                 # Child nodes
                 symbol=None, accession=None, name=None, location=None,
                 mol_seq=None, uri=None, domain_architecture=None,
                 # Collections
                 annotations=None, other=None,
                 ):
        """Store the given values, warning about a non-compliant type/symbol.

        Invalid ``type`` or ``symbol`` values only trigger a warning through
        ``_check_str``; the values are stored regardless.
        """
        _check_str(type, self.alphabets.__contains__)
        _check_str(symbol, self.re_symbol.match)
        self.type = type
        self.id_ref = id_ref
        self.id_source = id_source
        self.symbol = symbol
        self.accession = accession
        self.name = name
        self.location = location
        self.mol_seq = mol_seq
        self.uri = uri
        self.domain_architecture = domain_architecture
        # Collections default to fresh empty lists, never a shared object.
        self.annotations = annotations or []
        self.other = other or []

    @classmethod
    def from_seqrecord(cls, record, is_aligned=None):
        """Create a new PhyloXML Sequence from a SeqRecord object.

        :Parameters:
            record : SeqRecord
                the record to convert; its ``annotations`` dictionary is
                unpacked using the layout documented in ``to_seqrecord``
            is_aligned : bool
                whether the sequence is aligned; if None, this is inferred
                from whether the record's alphabet is ``Alphabet.Gapped``

        Returns an instance of ``cls``, so subclasses get their own type back.
        """
        if is_aligned is None:
            is_aligned = isinstance(record.seq.alphabet, Alphabet.Gapped)
        params = {
            'accession': Accession(record.id, ''),
            'symbol': record.name,
            'name': record.description,
            'mol_seq': MolSeq(str(record.seq), is_aligned),
        }
        # 'type' is only set when the alphabet identifies the molecule type.
        if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet):
            params['type'] = 'dna'
        elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet):
            params['type'] = 'rna'
        elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet):
            params['type'] = 'protein'

        # Unpack record.annotations
        for key in ('id_ref', 'id_source', 'location'):
            if key in record.annotations:
                params[key] = record.annotations[key]
        if isinstance(record.annotations.get('uri'), dict):
            params['uri'] = Uri(**record.annotations['uri'])
        # Build a Sequence.annotation object
        if record.annotations.get('annotations'):
            params['annotations'] = []
            for annot in record.annotations['annotations']:
                ann_args = {}
                for key in ('ref', 'source', 'evidence', 'type', 'desc'):
                    if key in annot:
                        ann_args[key] = annot[key]
                if isinstance(annot.get('confidence'), list):
                    ann_args['confidence'] = Confidence(
                        *annot['confidence'])
                if isinstance(annot.get('properties'), list):
                    # Entries that are not dictionaries are silently skipped.
                    ann_args['properties'] = [Property(**prop)
                                              for prop in annot['properties']
                                              if isinstance(prop, dict)]
                params['annotations'].append(Annotation(**ann_args))

        # Unpack record.features
        if record.features:
            params['domain_architecture'] = DomainArchitecture(
                length=len(record.seq),
                domains=[ProteinDomain.from_seqfeature(feat)
                         for feat in record.features])

        # Use cls rather than the hard-coded class name so that subclasses
        # calling this alternate constructor are instantiated correctly.
        return cls(**params)

    def to_seqrecord(self):
        """Create a SeqRecord object from this Sequence instance.

        ``mol_seq`` must be set: its ``value`` is read unconditionally.

        The seqrecord.annotations dictionary is packed like so::

            { # Sequence attributes with no SeqRecord equivalent:
              'id_ref':     self.id_ref,
              'id_source':  self.id_source,
              'location':   self.location,
              'uri':        { 'value': self.uri.value,
                              'desc': self.uri.desc,
                              'type': self.uri.type },
              # Sequence.annotations attribute (list of Annotations)
              'annotations': [{'ref':      ann.ref,
                               'source':   ann.source,
                               'evidence': ann.evidence,
                               'type':     ann.type,
                               'confidence': [ann.confidence.value,
                                              ann.confidence.type],
                               'properties': [{'value': prop.value,
                                               'ref': prop.ref,
                                               'applies_to': prop.applies_to,
                                               'datatype': prop.datatype,
                                               'unit': prop.unit,
                                               'id_ref': prop.id_ref}
                                              for prop in ann.properties],
                               } for ann in self.annotations],
            }

        Items whose value is None are left out of every dictionary above.
        """
        def clean_dict(dct):
            """Remove None-valued items from a dictionary."""
            return dict((key, val) for key, val in dct.items()
                        if val is not None)

        # Only stringify an accession that exists: str(None) would otherwise
        # give the record the literal id 'None'. Leaving the key out lets
        # SeqRecord apply its own default id.
        if self.accession is not None:
            record_id = str(self.accession)
        else:
            record_id = None
        seqrec = SeqRecord(Seq(self.mol_seq.value, self.get_alphabet()),
                           **clean_dict({
                               'id': record_id,
                               'name': self.symbol,
                               'description': self.name,
                           }))
        if self.domain_architecture:
            seqrec.features = [dom.to_seqfeature()
                               for dom in self.domain_architecture.domains]
        # Sequence attributes with no SeqRecord equivalent
        # NOTE(review): from_seqrecord also reads a 'desc' key from each
        # annotation dictionary, but it is not written here -- confirm whether
        # Annotation exposes ``desc`` and whether it should round-trip.
        seqrec.annotations = clean_dict({
            'id_ref': self.id_ref,
            'id_source': self.id_source,
            'location': self.location,
            'uri': self.uri and clean_dict({
                'value': self.uri.value,
                'desc': self.uri.desc,
                'type': self.uri.type,
            }),
            'annotations': self.annotations and [
                clean_dict({
                    'ref': ann.ref,
                    'source': ann.source,
                    'evidence': ann.evidence,
                    'type': ann.type,
                    'confidence': ann.confidence and [
                        ann.confidence.value,
                        ann.confidence.type],
                    'properties': [clean_dict({
                        'value': prop.value,
                        'ref': prop.ref,
                        'applies_to': prop.applies_to,
                        'datatype': prop.datatype,
                        'unit': prop.unit,
                        'id_ref': prop.id_ref})
                        for prop in ann.properties],
                }) for ann in self.annotations],
        })
        return seqrec

    def get_alphabet(self):
        """Return the Biopython alphabet matching this sequence's type.

        Falls back to ``Alphabet.generic_alphabet`` when ``type`` is not one
        of the keys of ``alphabets``. The alphabet is wrapped in
        ``Alphabet.Gapped`` if ``mol_seq`` is set and flagged as aligned.
        """
        alph = self.alphabets.get(self.type, Alphabet.generic_alphabet)
        if self.mol_seq and self.mol_seq.is_aligned:
            return Alphabet.Gapped(alph)
        return alph
1190
class SequenceRelation(PhyloElement):
    """A typed relationship between two sequences.

    An orthology between two sequences, for instance, is expressed by setting
    the 'type' attribute to 'orthology'.

    :Parameters:
        type : restricted string
            describe the type of relationship
        id_ref_0 : Id
            first sequence reference identifier
        id_ref_1 : Id
            second sequence reference identifier
        distance : float
            distance between the two sequences
        confidence : Confidence
            confidence value for this relation
    """

    # Relationship types accepted without a warning.
    ok_type = set([
        'orthology',
        'one_to_one_orthology',
        'super_orthology',
        'paralogy',
        'ultra_paralogy',
        'xenology',
        'unknown',
        'other',
    ])

    def __init__(self, type, id_ref_0, id_ref_1,
                 distance=None, confidence=None):
        """Store the relation, warning if ``type`` is not in ``ok_type``."""
        # A non-compliant type only produces a warning; it is still stored.
        _check_str(type, self.ok_type.__contains__)
        self.distance = distance
        self.type = type
        self.id_ref_0 = id_ref_0
        self.id_ref_1 = id_ref_1
        self.confidence = confidence
1221
class Taxonomy(PhyloElement):
    """Describe taxonomic information for a clade.

    :Parameters:
        id_source : Id
            link other elements to a taxonomy (on the XML level)
        id : Id
            unique identifier of a taxon, e.g. Id('6500',
            provider='ncbi_taxonomy') for the California sea hare
        code : restricted string
            store UniProt/Swiss-Prot style organism codes, e.g. 'APLCA' for the
            California sea hare 'Aplysia californica'
        scientific_name : string
            the standard scientific name for this organism, e.g. 'Aplysia
            californica' for the California sea hare
        authority : string
            keep the authority, such as 'J. G. Cooper, 1863', associated with
            the 'scientific_name'
        common_names : list of strings
            common names for this organism
        synonyms : list of strings
            synonyms for this taxon?
        rank : restricted string
            taxonomic rank
        uri : Uri
            link
        other : list of Other objects
            non-phyloXML elements
    """

    # The pattern is used with ``.match``, which only anchors at the start of
    # the string; the trailing ``\Z`` makes the whole code subject to the
    # 2-10 character limit instead of just its prefix.
    re_code = re.compile(r'[a-zA-Z0-9_]{2,10}\Z')
    # Taxonomic ranks accepted without a warning.
    ok_rank = set(('domain', 'kingdom', 'subkingdom', 'branch', 'infrakingdom',
        'superphylum', 'phylum', 'subphylum', 'infraphylum', 'microphylum',
        'superdivision', 'division', 'subdivision', 'infradivision',
        'superclass', 'class', 'subclass', 'infraclass', 'superlegion',
        'legion', 'sublegion', 'infralegion', 'supercohort', 'cohort',
        'subcohort', 'infracohort', 'superorder', 'order', 'suborder',
        'superfamily', 'family', 'subfamily', 'supertribe', 'tribe', 'subtribe',
        'infratribe', 'genus', 'subgenus', 'superspecies', 'species',
        'subspecies', 'variety', 'subvariety', 'form', 'subform', 'cultivar',
        'unknown', 'other'))

    def __init__(self,
                 # Attributes
                 id_source=None,
                 # Child nodes
                 id=None, code=None, scientific_name=None, authority=None,
                 rank=None, uri=None,
                 # Collections
                 common_names=None, synonyms=None, other=None,
                 ):
        """Store the given values, warning about a non-compliant code/rank.

        Invalid ``code`` or ``rank`` values only trigger a warning through
        ``_check_str``; the values are stored regardless.
        """
        _check_str(code, self.re_code.match)
        _check_str(rank, self.ok_rank.__contains__)
        self.id_source = id_source
        self.id = id
        self.code = code
        self.scientific_name = scientific_name
        self.authority = authority
        self.rank = rank
        self.uri = uri
        # Collections default to fresh empty lists, never a shared object.
        self.common_names = common_names or []
        self.synonyms = synonyms or []
        self.other = other or []

    def __str__(self):
        """Return the most specific identifying attribute that is set.

        The code, scientific name, rank and id are tried in that order; the
        class name is used only when all of them are None.
        """
        if self.code is not None:
            return self.code
        if self.scientific_name is not None:
            return self.scientific_name
        if self.rank is not None:
            return self.rank
        if self.id is not None:
            return str(self.id)
        return self.__class__.__name__
1297
class Uri(PhyloElement):
    """A uniform resource identifier.

    Typically this holds a URL. When linking to an image on a website, for
    example, ``type`` might be 'image' and ``desc`` might be 'image of a
    California sea hare'.
    """

    def __init__(self, value, desc=None, type=None):
        """Store the identifier with its optional description and type."""
        self.value = value
        self.desc = desc
        self.type = type

    def __str__(self):
        """Return the stored value, or the repr when the value is empty."""
        return self.value if self.value else repr(self)
1316