1
2
3
4
5
6 """Classes corresponding to phyloXML elements.
7
8 See Also
9 --------
10 Official specification:
11 http://phyloxml.org/
12 Journal article:
13 Han and Zmasek (2009), doi:10.1186/1471-2105-10-356
14 """
15 __docformat__ = "restructuredtext en"
16
17 import re
18 import warnings
19
20 from Bio import Alphabet
21 from Bio.Align import MultipleSeqAlignment
22 from Bio.Seq import Seq
23 from Bio.SeqFeature import SeqFeature, FeatureLocation
24 from Bio.SeqRecord import SeqRecord
25 from Bio import BiopythonWarning
26
27 from Bio.Phylo import BaseTree
31 """Warning for non-compliance with the phyloXML specification."""
32 pass
33
36 """Check a string using testfunc, and warn if there's no match."""
37 if text is not None and not testfunc(text):
38 warnings.warn("String %s doesn't match the given regexp" % text,
39 PhyloXMLWarning, stacklevel=2)
40
45 """Base class for all PhyloXML objects."""
46
49 """Root node of the PhyloXML document.
50
51 Contains an arbitrary number of Phylogeny elements, possibly followed by
52 elements from other namespaces.
53
54 :Parameters:
55 attributes : dict
56 (XML namespace definitions)
57 phylogenies : list
58 The phylogenetic trees
59 other : list
60 Arbitrary non-phyloXML elements, if any
61 """
def __init__(self, attributes, phylogenies=None, other=None):
    """Set up the document root.

    ``attributes`` is merged over the default namespace declarations below;
    a falsy value leaves the defaults untouched.  ``phylogenies`` and
    ``other`` fall back to fresh empty lists when omitted or empty.
    """
    # Namespace/schema declarations used unless the caller overrides them.
    root_attrs = {
        "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
        "xmlns": "http://www.phyloxml.org",
        "xsi:schemaLocation": "http://www.phyloxml.org http://www.phyloxml.org/1.10/phyloxml.xsd",
    }
    if attributes:
        root_attrs.update(attributes)
    self.attributes = root_attrs
    self.phylogenies = phylogenies if phylogenies else []
    self.other = other if other else []
72
74 """Get a phylogeny by index or name."""
75 if isinstance(index, int) or isinstance(index, slice):
76 return self.phylogenies[index]
77 if not isinstance(index, basestring):
78 raise KeyError("can't use %s as an index" % type(index))
79 for tree in self.phylogenies:
80 if tree.name == index:
81 return tree
82 else:
83 raise KeyError("no phylogeny found with name " + repr(index))
84
86 """Iterate through the phylogenetic trees in this object."""
87 return iter(self.phylogenies)
88
90 """Number of phylogenetic trees in this object."""
91 return len(self.phylogenies)
92
94 return '%s([%s])' % (self.__class__.__name__,
95 ',\n'.join(map(str, self.phylogenies)))
96
97
98 -class Other(PhyloElement):
99 """Container for non-phyloXML elements in the tree.
100
101 Usually, an Other object will have either a 'value' or a non-empty list
102 of 'children', but not both. This is not enforced here, though.
103
104 :Parameters:
105 tag : string
106 local tag for the XML node
107 namespace : string
108 XML namespace for the node -- should not be the default phyloXML
109 namespace.
110 attributes : dict of strings
111 attributes on the XML node
112 value : string
113 text contained directly within this XML node
114 children : list
115 child nodes, if any (also `Other` instances)
116 """
def __init__(self, tag, namespace=None, attributes=None, value=None,
             children=None):
    """Store the node's tag, namespace, attributes, text and children.

    Missing or empty ``attributes``/``children`` are replaced by a new empty
    dict/list respectively.
    """
    # Scalars are kept exactly as passed in.
    self.tag, self.namespace, self.value = tag, namespace, value
    # Containers always exist, so callers never need a None check.
    self.attributes = attributes if attributes else {}
    self.children = children if children else []
124
126 """Iterate through the children of this object (if any)."""
127 return iter(self.children)
128
129
130 -class Phylogeny(PhyloElement, BaseTree.Tree):
131 """A phylogenetic tree.
132
133 :Parameters:
134 root : Clade
135 the root node/clade of this tree
136 rooted : bool
137 True if this tree is rooted
138 rerootable : bool
139 True if this tree is rerootable
140 branch_length_unit : string
141 unit for branch_length values on clades
142 name : string
143 identifier for this tree, not required to be unique
144 id : Id
145 unique identifier for this tree
146 description : string
147 plain-text description
148 date : Date
149 date for the root node of this tree
150 confidences : list
151 Confidence objects for this tree
152 clade_relations : list
153 CladeRelation objects
154 sequence_relations : list
155 SequenceRelation objects
156 properties : list
157 Property objects
158 other : list
159 non-phyloXML elements (type `Other`)
160 """
def __init__(self, root=None, rooted=True,
             rerootable=None, branch_length_unit=None, type=None,
             name=None, id=None, description=None, date=None,
             confidences=None, clade_relations=None, sequence_relations=None,
             properties=None, other=None):
    """Create a tree from its attributes, child nodes and collections.

    ``rooted`` must be a bool (checked with an assertion).  Every list-valued
    argument that is omitted or empty is stored as a new empty list.
    """
    assert isinstance(rooted, bool)

    # XML attributes of the phylogeny element.
    self.root, self.rooted, self.rerootable = root, rooted, rerootable
    self.branch_length_unit, self.type = branch_length_unit, type

    # Single-valued child nodes.
    self.name, self.id = name, id
    self.description, self.date = description, date

    # Repeatable child nodes are always stored as lists.
    collections = (
        ('confidences', confidences),
        ('clade_relations', clade_relations),
        ('sequence_relations', sequence_relations),
        ('properties', properties),
        ('other', other),
    )
    for attr_name, items in collections:
        setattr(self, attr_name, items or [])
184
185 @classmethod
187 """Create a new Phylogeny given a Tree (from Newick/Nexus or BaseTree).
188
189 Keyword arguments are the usual `Phylogeny` constructor parameters.
190 """
191 phy = cls(
192 root=Clade.from_clade(tree.root),
193 rooted=tree.rooted,
194 name=tree.name,
195 id=(tree.id is not None) and Id(str(tree.id)) or None)
196 phy.__dict__.update(kwargs)
197 return phy
198
199 @classmethod
201 """Create a new Phylogeny given a Newick or BaseTree Clade object.
202
203 Keyword arguments are the usual `PhyloXML.Clade` constructor parameters.
204 """
205 return Clade.from_clade(clade).to_phylogeny(**kwargs)
206
208 """Return this tree, a PhyloXML-compatible Phylogeny object.
209
210 Overrides the `BaseTree` method.
211 """
212 return self
213
215 """Create a new Phyloxml object containing just this phylogeny."""
216 return Phyloxml(kwargs, phylogenies=[self])
217
219 """Construct an alignment from the aligned sequences in this tree."""
220 def is_aligned_seq(elem):
221 if isinstance(elem, Sequence) and elem.mol_seq.is_aligned:
222 return True
223 return False
224 seqs = self._filter_search(is_aligned_seq, 'preorder', True)
225 try:
226 first_seq = seqs.next()
227 except StopIteration:
228
229 return MultipleSeqAlignment([])
230 msa = MultipleSeqAlignment([first_seq.to_seqrecord()],
231 first_seq.get_alphabet())
232 msa.extend(seq.to_seqrecord() for seq in seqs)
233 return msa
234
235
237 """Equivalent to self.confidences[0] if there is only 1 value.
238
239 See also: `Clade.confidence`, `Clade.taxonomy`
240 """
241 if len(self.confidences) == 0:
242 return None
243 if len(self.confidences) > 1:
244 raise AttributeError("more than 1 confidence value available; "
245 "use Phylogeny.confidences")
246 return self.confidences[0]
247
249 if value is None:
250
251 self.confidences = []
252 return
253 if isinstance(value, float) or isinstance(value, int):
254 value = Confidence(value)
255 elif not isinstance(value, Confidence):
256 raise ValueError("value must be a number or Confidence instance")
257 if len(self.confidences) == 0:
258 self.confidences.append(value)
259 elif len(self.confidences) == 1:
260 self.confidences[0] = value
261 else:
262 raise ValueError("multiple confidence values already exist; "
263 "use Phylogeny.confidences instead")
264
266 self.confidences = []
267
268 confidence = property(_get_confidence, _set_confidence, _del_confidence)
269
270
271 -class Clade(PhyloElement, BaseTree.Clade):
272 """Describes a branch of the current phylogenetic tree.
273
274 Used recursively, describes the topology of a phylogenetic tree.
275
276 Both ``color`` and ``width`` elements should be interpreted by client code
277 as applying to the whole clade, including all descendents, unless
278 overwritten in-sub clades. This module doesn't automatically assign these
279 attributes to sub-clades to achieve this cascade -- and neither should you.
280
281 :Parameters:
282 branch_length
283 parent branch length of this clade
284 id_source
285 link other elements to a clade (on the xml-level)
286 name : string
287 short label for this clade
288 confidences : list of Confidence objects
289 used to indicate the support for a clade/parent branch.
290 width : float
291 branch width for this clade (including branch from parent)
292 color : BranchColor
293 color used for graphical display of this clade
294 node_id
295 unique identifier for the root node of this clade
296 taxonomies : list
297 Taxonomy objects
298 sequences : list
299 Sequence objects
300 events : Events
301 describe such events as gene-duplications at the root node/parent
302 branch of this clade
303 binary_characters : BinaryCharacters
304 binary characters
305 distributions : list of Distribution objects
306 distribution(s) of this clade
307 date : Date
308 a date for the root node of this clade
309 references : list
310 Reference objects
311 properties : list
312 Property objects
313 clades : list Clade objects
314 Sub-clades
315 other : list of Other objects
316 non-phyloXML objects
317 """
318 - def __init__(self,
319
320 branch_length=None, id_source=None,
321
322 name=None, width=None, color=None, node_id=None, events=None,
323 binary_characters=None, date=None,
324
325 confidences=None, taxonomies=None, sequences=None,
326 distributions=None, references=None, properties=None, clades=None,
327 other=None,
328 ):
346
347 @classmethod
363
365 """Create a new phylogeny containing just this clade."""
366 phy = Phylogeny(root=self, date=self.date)
367 phy.__dict__.update(kwargs)
368 return phy
369
370
371
373 if len(self.confidences) == 0:
374 return None
375 if len(self.confidences) > 1:
376 raise AttributeError("more than 1 confidence value available; "
377 "use Clade.confidences")
378 return self.confidences[0]
379
381 if value is None:
382
383 self.confidences = []
384 return
385 if isinstance(value, float) or isinstance(value, int):
386 value = Confidence(value)
387 elif not isinstance(value, Confidence):
388 raise ValueError("value must be a number or Confidence instance")
389 if len(self.confidences) == 0:
390 self.confidences.append(value)
391 elif len(self.confidences) == 1:
392 self.confidences[0] = value
393 else:
394 raise ValueError("multiple confidence values already exist; "
395 "use Phylogeny.confidences instead")
396
398 self.confidences = []
399
400 confidence = property(_get_confidence, _set_confidence, _del_confidence)
401
403 if len(self.taxonomies) == 0:
404 return None
405 if len(self.taxonomies) > 1:
406 raise AttributeError("more than 1 taxonomy value available; "
407 "use Clade.taxonomies")
408 return self.taxonomies[0]
409
411 if not isinstance(value, Taxonomy):
412 raise ValueError("assigned value must be a Taxonomy instance")
413 if len(self.taxonomies) == 0:
414 self.taxonomies.append(value)
415 elif len(self.taxonomies) == 1:
416 self.taxonomies[0] = value
417 else:
418 raise ValueError("multiple taxonomy values already exist; "
419 "use Phylogeny.taxonomies instead")
420
421 taxonomy = property(_get_taxonomy, _set_taxonomy)
422
423
424
425
426 -class BranchColor(PhyloElement, BaseTree.BranchColor):
429
434 """Captures the local part in a sequence identifier.
435
436 Example: In ``UniProtKB:P17304``, the Accession instance attribute ``value``
437 is 'P17304' and the ``source`` attribute is 'UniProtKB'.
438 """
442
444 """Show the class name and an identifying attribute."""
445 return '%s:%s' % (self.source, self.value)
446
449 """The annotation of a molecular sequence.
450
451 It is recommended to annotate by using the optional 'ref' attribute.
452
453 :Parameters:
454 ref : string
455 reference string, e.g. 'GO:0008270',
456 'KEGG:Tetrachloroethene degradation', 'EC:1.1.1.1'
457 source : string
458 plain-text source for this annotation
459 evidence : str
460 describe evidence as free text (e.g. 'experimental')
461 desc : string
462 free text description
463 confidence : Confidence
464 state the type and value of support (type Confidence)
465 properties : list
466 typed and referenced annotations from external resources
467 uri : Uri
468 link
469 """
470 re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')
471
472 - def __init__(self,
473
474 ref=None, source=None, evidence=None, type=None,
475
476 desc=None, confidence=None, uri=None,
477
478 properties=None):
488
491 """The names and/or counts of binary characters present, gained, and lost
492 at the root of a clade.
493 """
def __init__(self,
             type=None, gained_count=None, lost_count=None,
             present_count=None, absent_count=None,
             gained=None, lost=None, present=None, absent=None):
    """Store the character type, the four counts and the four name lists.

    The ``*_count`` values are kept as given; ``gained``, ``lost``,
    ``present`` and ``absent`` default to new empty lists.
    """
    self.type = type

    # Counts are stored unchanged (None means "not given").
    self.gained_count, self.lost_count = gained_count, lost_count
    self.present_count, self.absent_count = present_count, absent_count

    # Collections of characters, never None.
    self.gained = gained if gained else []
    self.lost = lost if lost else []
    self.present = present if present else []
    self.absent = absent if absent else []
509
512 """Expresses a typed relationship between two clades.
513
514 For example, this could be used to describe multiple parents of a clade.
515
516 @type id_ref_0: str
517 @type id_ref_1: str
518 @type distance: str
519 @type type: str
520
521 @type confidence: Confidence
522 """
523 - def __init__(self, type, id_ref_0, id_ref_1,
524 distance=None, confidence=None):
530
533 """A general purpose confidence element.
534
535 For example, this can be used to express the bootstrap support value of a
536 clade (in which case the `type` attribute is 'bootstrap').
537
538 :Parameters:
539 value : float
540 confidence value
541 type : string
542 label for the type of confidence, e.g. 'bootstrap'
543 """
544 - def __init__(self, value, type='unknown'):
547
548
549
551 """Return the hash value of the object.
552
553 Hash values are integers. They are used to quickly compare dictionary
554 keys during a dictionary lookup. Numeric values that compare equal have
555 the same hash value (even if they are of different types, as is the
556 case for 1 and 1.0).
557 """
558 return id(self)
559
564
569
570
571
576
579
581 return not (self <= other)
582
585
586
587
590
593
596
599
602
605
608
611
613 """Rational-style division in Py3.0+.
614
615 Also active in Py2.5+ with __future__.division import.
616 """
617 return self.value / other
618
621
623 """C-style and old-style division in Py3.0+.
624
625 Also active in Py2.5+ with __future__.division import.
626 """
627 return self.value.__floordiv__(other)
628
631
634
637
640
643
def __pow__(self, other, modulo=None):
    """Return ``self.value`` raised to ``other``, optionally mod ``modulo``.

    Delegates to the built-in ``pow``; the three-argument form is used only
    when ``modulo`` is given.
    """
    if modulo is None:
        return pow(self.value, other)
    return pow(self.value, other, modulo)
648
651
652
653
656
659
662
663
664
666 return float(self.value)
667
669 return int(self.value)
670
673
674
675 -class Date(PhyloElement):
676 """A date associated with a clade/node.
677
678 Its value can be numerical by using the 'value' element and/or free text
679 with the 'desc' element' (e.g. 'Silurian'). If a numerical value is used, it
680 is recommended to employ the 'unit' attribute.
681
682 :Parameters:
683 unit : string
684 type of numerical value (e.g. 'mya' for 'million years ago')
685 value : float
686 the date value
687 desc : string
688 plain-text description of the date
689 minimum : float
690 lower bound on the date value
691 maximum : float
692 upper bound on the date value
693 """
694 - def __init__(self, value=None, unit=None, desc=None,
695 minimum=None, maximum=None):
701
703 """Show the class name and the human-readable date."""
704 if self.unit and self.value is not None:
705 return '%s %s' % (self.value, self.unit)
706 if self.desc is not None:
707 return self.desc
708 return self.__class__.__name__
709
712 """Geographic distribution of the items of a clade (species, sequences).
713
714 Intended for phylogeographic applications.
715
716 :Parameters:
717 desc : string
718 free-text description of the location
719 points : list of `Point` objects
720 coordinates (similar to the 'Point' element in Google's KML format)
721 polygons : list of `Polygon` objects
722 coordinate sets defining geographic regions
723 """
def __init__(self, desc=None, points=None, polygons=None):
    """Store the description plus the point and polygon collections.

    ``points`` and ``polygons`` default to new empty lists.
    """
    self.desc = desc
    # Both geometry collections always exist as lists.
    self.points = points if points else []
    self.polygons = polygons if polygons else []
728
729
class DomainArchitecture(PhyloElement):
    """Domain architecture of a protein.

    :Parameters:
        length : int
            total length of the protein sequence
        domains : list ProteinDomain objects
            the domains within this protein; an empty list if not given
    """

    def __init__(self, length=None, domains=None):
        self.length = length
        # Store a list even when no domains are supplied: the SeqRecord
        # conversion in this module iterates over ``domains`` directly, which
        # would raise TypeError on None.  A list passed by the caller is kept
        # as the same object.
        self.domains = domains if domains is not None else []
742
743
744 -class Events(PhyloElement):
745 """Events at the root node of a clade (e.g. one gene duplication).
746
747 All attributes are set to None by default, but this object can also be
748 treated as a dictionary, in which case None values are treated as missing
749 keys and deleting a key resets that attribute's value back to None.
750 """
751 ok_type = set(('transfer', 'fusion', 'speciation_or_duplication', 'other',
752 'mixed', 'unassigned'))
753
754 - def __init__(self, type=None, duplications=None, speciations=None,
755 losses=None, confidence=None):
762
764 return [(k, v) for k, v in self.__dict__.iteritems() if v is not None]
765
767 return [k for k, v in self.__dict__.iteritems() if v is not None]
768
770 return [v for v in self.__dict__.itervalues() if v is not None]
771
774
776 if not hasattr(self, key):
777 raise KeyError(key)
778 val = getattr(self, key)
779 if val is None:
780 raise KeyError("%s has not been set in this object" % repr(key))
781 return val
782
784 setattr(self, key, val)
785
787 setattr(self, key, None)
788
790 return iter(self.keys())
791
793 return (hasattr(self, key) and getattr(self, key) is not None)
794
795
796 -class Id(PhyloElement):
797 """A general-purpose identifier element.
798
799 Allows to indicate the provider (or authority) of an identifier, e.g. NCBI,
800 along with the value itself.
801 """
def __init__(self, value, provider=None):
    """Store the identifier ``value`` and, optionally, its ``provider``."""
    self.value, self.provider = value, provider
805
807 if self.provider is not None:
808 return '%s:%s' % (self.provider, self.value)
809 return self.value
810
811
812 -class MolSeq(PhyloElement):
813 """Store a molecular sequence.
814
815 :Parameters:
816 value : string
817 the sequence itself
818 is_aligned : bool
819 True if this sequence is aligned with the others (usually meaning
820 all aligned seqs are the same length and gaps may be present)
821 """
822 re_value = re.compile(r'[a-zA-Z\.\-\?\*_]+')
823
824 - def __init__(self, value, is_aligned=None):
828
831
832
class Point(PhyloElement):
    """A geographic point: datum, latitude, longitude and optional altitude.

    Used by element 'Distribution'.

    :Parameters:
        geodetic_datum : string, required
            the geodetic datum (also called 'map datum'), e.g. 'WGS84' as
            used by Google's KML
        lat : numeric
            latitude
        long : numeric
            longitude
        alt : numeric
            altitude
        alt_unit : string
            unit for the altitude (e.g. 'meter')
    """

    def __init__(self, geodetic_datum, lat, long, alt=None, alt_unit=None):
        # Required position: the datum plus latitude/longitude.
        self.geodetic_datum, self.lat, self.long = geodetic_datum, lat, long
        # Optional elevation and the unit it is expressed in.
        self.alt, self.alt_unit = alt, alt_unit
857
860 """A polygon defined by a list of 'Points' (used by element 'Distribution').
861
862 :param points: list of 3 or more points representing vertices.
863 """
865 self.points = points or []
866
868 return '%s([%s])' % (self.__class__.__name__,
869 ',\n'.join(map(str, self.points)))
870
873 """A typed and referenced property from an external resources.
874
875 Can be attached to `Phylogeny`, `Clade`, and `Annotation` objects.
876
877 :Parameters:
878 value : string
879 the value of the property
880 ref : string
881 reference to an external resource, e.g. "NOAA:depth"
882 applies_to : string
883 indicates the item to which a property applies to (e.g. 'node' for
884 the parent node of a clade, 'parent_branch' for the parent branch of
885 a clade, or just 'clade').
886 datatype : string
887 the type of a property; limited to xsd-datatypes
888 (e.g. 'xsd:string', 'xsd:boolean', 'xsd:integer', 'xsd:decimal',
889 'xsd:float', 'xsd:double', 'xsd:date', 'xsd:anyURI').
890 unit : string (optional)
891 the unit of the property, e.g. "METRIC:m"
892 id_ref : Id (optional)
893 allows to attached a property specifically to one element (on the
894 xml-level)
895 """
896 re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')
897 ok_applies_to = set(('phylogeny', 'clade', 'node', 'annotation',
898 'parent_branch', 'other'))
899 ok_datatype = set(('xsd:string', 'xsd:boolean', 'xsd:decimal', 'xsd:float',
900 'xsd:double', 'xsd:duration', 'xsd:dateTime', 'xsd:time', 'xsd:date',
901 'xsd:gYearMonth', 'xsd:gYear', 'xsd:gMonthDay', 'xsd:gDay',
902 'xsd:gMonth', 'xsd:hexBinary', 'xsd:base64Binary', 'xsd:anyURI',
903 'xsd:normalizedString', 'xsd:token', 'xsd:integer',
904 'xsd:nonPositiveInteger', 'xsd:negativeInteger', 'xsd:long', 'xsd:int',
905 'xsd:short', 'xsd:byte', 'xsd:nonNegativeInteger', 'xsd:unsignedLong',
906 'xsd:unsignedInt', 'xsd:unsignedShort', 'xsd:unsignedByte',
907 'xsd:positiveInteger'))
908
909 - def __init__(self, value, ref, applies_to, datatype,
910 unit=None, id_ref=None):
921
922
class ProteinDomain(PhyloElement):
    """Represents an individual domain in a domain architecture.

    The locations use 0-based indexing, as most Python objects including
    SeqFeature do, rather than the usual biological convention starting at 1.
    This means the start and end attributes can be used directly as slice
    indexes on Seq objects.

    :Parameters:
        value : string
            the domain's value (presumably its name -- confirm against the
            phyloXML schema); used as the ``id`` of the SeqFeature built by
            `to_seqfeature`
        start : non-negative integer
            start of the domain on the sequence, using 0-based indexing
        end : non-negative integer
            end of the domain on the sequence
        confidence : float
            can be used to store e.g. E-values
        id : string
            unique identifier/name
    """

    def __init__(self, value, start, end, confidence=None, id=None):
        self.value = value
        self.start = start
        self.end = end
        self.confidence = confidence
        self.id = id

    @classmethod
    def from_seqfeature(cls, feat):
        """Create a domain from a SeqFeature.

        The feature's ``id`` becomes ``value``, the non-fuzzy start/end of its
        location become ``start``/``end``, and the 'confidence' qualifier (if
        present) becomes ``confidence``.
        """
        # Build via ``cls`` rather than the hard-coded class name so that
        # subclasses get an instance of their own type.
        return cls(feat.id,
                   feat.location.nofuzzy_start,
                   feat.location.nofuzzy_end,
                   confidence=feat.qualifiers.get('confidence'))

    def to_seqfeature(self):
        """Create a SeqFeature spanning ``start``..``end`` with ``value`` as id."""
        feat = SeqFeature(location=FeatureLocation(self.start, self.end),
                          id=self.value)
        # __init__ always sets ``confidence``, so this only skips instances
        # whose attribute was deleted afterwards; a None confidence is still
        # copied into the qualifiers.
        if hasattr(self, 'confidence'):
            feat.qualifiers['confidence'] = self.confidence
        return feat
962
965 """Literature reference for a clade.
966
967 NB: Whenever possible, use the ``doi`` attribute instead of the free-text
968 ``desc`` element.
969 """
970 re_doi = re.compile(r'[a-zA-Z0-9_\.]+/[a-zA-Z0-9_\.]+')
971
972 - def __init__(self, doi=None, desc=None):
976
979 """A molecular sequence (Protein, DNA, RNA) associated with a node.
980
981 One intended use for ``id_ref`` is to link a sequence to a taxonomy (via the
982 taxonomy's ``id_source``) in case of multiple sequences and taxonomies per
983 node.
984
985 :Parameters:
986 type : {'dna', 'rna', 'protein'}
987 type of molecule this sequence represents
988 id_ref : string
989 reference to another resource
990 id_source : string
991 source for the reference
992 symbol : string
993 short symbol of the sequence, e.g. 'ACTM' (max. 10 chars)
994 accession : Accession
995 accession code for this sequence.
996 name : string
997 full name of the sequence, e.g. 'muscle Actin'
998 location
999 location of a sequence on a genome/chromosome.
1000 mol_seq : MolSeq
1001 the molecular sequence itself
1002 uri : Uri
1003 link
1004 annotations : list of Annotation objects
1005 annotations on this sequence
1006 domain_architecture : DomainArchitecture
1007 protein domains on this sequence
1008 other : list of Other objects
1009 non-phyloXML elements
1010 """
1011 alphabets = {'dna': Alphabet.generic_dna,
1012 'rna': Alphabet.generic_rna,
1013 'protein': Alphabet.generic_protein}
1014 re_symbol = re.compile(r'\S{1,10}')
1015
1016 - def __init__(self,
1017
1018 type=None, id_ref=None, id_source=None,
1019
1020 symbol=None, accession=None, name=None, location=None,
1021 mol_seq=None, uri=None, domain_architecture=None,
1022
1023 annotations=None, other=None,
1024 ):
1039
1040 @classmethod
1042 """Create a new PhyloXML Sequence from a SeqRecord object."""
1043 if is_aligned is None:
1044 is_aligned = isinstance(record.seq.alphabet, Alphabet.Gapped)
1045 params = {
1046 'accession': Accession(record.id, ''),
1047 'symbol': record.name,
1048 'name': record.description,
1049 'mol_seq': MolSeq(str(record.seq), is_aligned),
1050 }
1051 if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet):
1052 params['type'] = 'dna'
1053 elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet):
1054 params['type'] = 'rna'
1055 elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet):
1056 params['type'] = 'protein'
1057
1058
1059 for key in ('id_ref', 'id_source', 'location'):
1060 if key in record.annotations:
1061 params[key] = record.annotations[key]
1062 if isinstance(record.annotations.get('uri'), dict):
1063 params['uri'] = Uri(**record.annotations['uri'])
1064
1065 if record.annotations.get('annotations'):
1066 params['annotations'] = []
1067 for annot in record.annotations['annotations']:
1068 ann_args = {}
1069 for key in ('ref', 'source', 'evidence', 'type', 'desc'):
1070 if key in annot:
1071 ann_args[key] = annot[key]
1072 if isinstance(annot.get('confidence'), list):
1073 ann_args['confidence'] = Confidence(
1074 *annot['confidence'])
1075 if isinstance(annot.get('properties'), list):
1076 ann_args['properties'] = [Property(**prop)
1077 for prop in annot['properties']
1078 if isinstance(prop, dict)]
1079 params['annotations'].append(Annotation(**ann_args))
1080
1081
1082 if record.features:
1083 params['domain_architecture'] = DomainArchitecture(
1084 length=len(record.seq),
1085 domains=[ProteinDomain.from_seqfeature(feat)
1086 for feat in record.features])
1087
1088 return Sequence(**params)
1089
1091 """Create a SeqRecord object from this Sequence instance.
1092
1093 The seqrecord.annotations dictionary is packed like so::
1094
1095 { # Sequence attributes with no SeqRecord equivalent:
1096 'id_ref': self.id_ref,
1097 'id_source': self.id_source,
1098 'location': self.location,
1099 'uri': { 'value': self.uri.value,
1100 'desc': self.uri.desc,
1101 'type': self.uri.type },
1102 # Sequence.annotations attribute (list of Annotations)
1103 'annotations': [{ 'ref': ann.ref,
1104 'source': ann.source,
1105 'evidence': ann.evidence,
1106 'type': ann.type,
1107 'confidence': [ ann.confidence.value,
1108 ann.confidence.type ],
1109 'properties': [{ 'value': prop.value,
1110 'ref': prop.ref,
1111 'applies_to': prop.applies_to,
1112 'datatype': prop.datatype,
1113 'unit': prop.unit,
1114 'id_ref': prop.id_ref }
1115 for prop in ann.properties],
1116 } for ann in self.annotations],
1117 }
1118 """
1119 def clean_dict(dct):
1120 """Remove None-valued items from a dictionary."""
1121 return dict((key, val) for key, val in dct.iteritems()
1122 if val is not None)
1123
1124 seqrec = SeqRecord(Seq(self.mol_seq.value, self.get_alphabet()),
1125 **clean_dict({
1126 'id': str(self.accession),
1127 'name': self.symbol,
1128 'description': self.name,
1129
1130 }))
1131 if self.domain_architecture:
1132 seqrec.features = [dom.to_seqfeature()
1133 for dom in self.domain_architecture.domains]
1134
1135 seqrec.annotations = clean_dict({
1136 'id_ref': self.id_ref,
1137 'id_source': self.id_source,
1138 'location': self.location,
1139 'uri': self.uri and clean_dict({
1140 'value': self.uri.value,
1141 'desc': self.uri.desc,
1142 'type': self.uri.type,
1143 }),
1144 'annotations': self.annotations and [
1145 clean_dict({
1146 'ref': ann.ref,
1147 'source': ann.source,
1148 'evidence': ann.evidence,
1149 'type': ann.type,
1150 'confidence': ann.confidence and [
1151 ann.confidence.value,
1152 ann.confidence.type],
1153 'properties': [clean_dict({
1154 'value': prop.value,
1155 'ref': prop.ref,
1156 'applies_to': prop.applies_to,
1157 'datatype': prop.datatype,
1158 'unit': prop.unit,
1159 'id_ref': prop.id_ref })
1160 for prop in ann.properties],
1161 }) for ann in self.annotations],
1162 })
1163 return seqrec
1164
1170
1173 """Express a typed relationship between two sequences.
1174
1175 For example, this could be used to describe an orthology (in which case
1176 attribute 'type' is 'orthology').
1177
1178 :Parameters:
1179 id_ref_0 : Id
1180 first sequence reference identifier
1181 id_ref_1 : Id
1182 second sequence reference identifier
1183 distance : float
1184 distance between the two sequences
1185 type : restricted string
1186 describe the type of relationship
1187 confidence : Confidence
1188 confidence value for this relation
1189 """
1190 ok_type = set(('orthology', 'one_to_one_orthology', 'super_orthology',
1191 'paralogy', 'ultra_paralogy', 'xenology', 'unknown', 'other'))
1192
1193 - def __init__(self, type, id_ref_0, id_ref_1,
1194 distance=None, confidence=None):
1201
1204 """Describe taxonomic information for a clade.
1205
1206 :Parameters:
1207 id_source : Id
1208 link other elements to a taxonomy (on the XML level)
1209 id : Id
1210 unique identifier of a taxon, e.g. Id('6500',
1211 provider='ncbi_taxonomy') for the California sea hare
1212 code : restricted string
1213 store UniProt/Swiss-Prot style organism codes, e.g. 'APLCA' for the
1214 California sea hare 'Aplysia californica'
1215 scientific_name : string
1216 the standard scientific name for this organism, e.g. 'Aplysia
1217 californica' for the California sea hare
1218 authority : string
1219 keep the authority, such as 'J. G. Cooper, 1863', associated with
1220 the 'scientific_name'
1221 common_names : list of strings
1222 common names for this organism
1223 synonyms : list of strings
1224 synonyms for this taxon?
1225 rank : restricted string
1226 taxonomic rank
1227 uri : Uri
1228 link
1229 other : list of Other objects
1230 non-phyloXML elements
1231 """
1232 re_code = re.compile(r'[a-zA-Z0-9_]{2,10}')
1233 ok_rank = set(('domain', 'kingdom', 'subkingdom', 'branch', 'infrakingdom',
1234 'superphylum', 'phylum', 'subphylum', 'infraphylum', 'microphylum',
1235 'superdivision', 'division', 'subdivision', 'infradivision',
1236 'superclass', 'class', 'subclass', 'infraclass', 'superlegion',
1237 'legion', 'sublegion', 'infralegion', 'supercohort', 'cohort',
1238 'subcohort', 'infracohort', 'superorder', 'order', 'suborder',
1239 'superfamily', 'family', 'subfamily', 'supertribe', 'tribe', 'subtribe',
1240 'infratribe', 'genus', 'subgenus', 'superspecies', 'species',
1241 'subspecies', 'variety', 'subvariety', 'form', 'subform', 'cultivar',
1242 'unknown', 'other'))
1243
1244 - def __init__(self,
1245
1246 id_source=None,
1247
1248 id=None, code=None, scientific_name=None, authority=None,
1249 rank=None, uri=None,
1250
1251 common_names=None, synonyms=None, other=None,
1252 ):
1265
1267 """Show the class name and an identifying attribute."""
1268 if self.code is not None:
1269 return self.code
1270 if self.scientific_name is not None:
1271 return self.scientific_name
1272 if self.rank is not None:
1273 return self.rank
1274 if self.id is not None:
1275 return str(self.id)
1276 return self.__class__.__name__
1277
1278
1279 -class Uri(PhyloElement):
1280 """A uniform resource identifier.
1281
1282 In general, this is expected to be an URL (for example, to link to an image
1283 on a website, in which case the ``type`` attribute might be 'image' and
1284 ``desc`` might be 'image of a California sea hare').
1285 """
1286 - def __init__(self, value, desc=None, type=None):
1290
1292 if self.value:
1293 return self.value
1294 return repr(self)
1295