1
2
3
4
5
6
7
8
9 """Represent a Sequence Feature holding info about a part of a sequence.
10
11 This is heavily modeled after the Biocorba SeqFeature objects, and
12 may be pretty biased towards GenBank stuff since I'm writing it
13 for the GenBank parser output...
14
15 What's here:
16
17 Base class to hold a Feature.
18 ----------------------------
19 classes:
20 o SeqFeature
21
22 Hold information about a Reference.
23 ----------------------------------
24
25 This is an attempt to create a General class to hold Reference type
26 information.
27
28 classes:
29 o Reference
30
31 Specify locations of a feature on a Sequence.
32 ---------------------------------------------
33
34 This aims to handle, in Ewan's words, 'the dreaded fuzziness issue' in
35 much the same way as Biocorba. This has the advantages of allowing us
36 to handle fuzzy stuff in case anyone needs it, and also be compatible
37 with Biocorba.
38
39 classes:
40 o FeatureLocation - Specify the start and end location of a feature.
41
42 o ExactPosition - Specify the position as being exact.
43 o WithinPosition - Specify a position occurring within some range.
44 o BetweenPosition - Specify a position occurring between a range (OBSOLETE?).
45 o BeforePosition - Specify the position as being found before some base.
46 o AfterPosition - Specify the position as being found after some base.
47 o OneOfPosition - Specify a position where the location can be multiple positions.
48 o UnknownPosition - Represents missing information like '?' in UniProt.
49 """
50
51 from Bio.Seq import MutableSeq, reverse_complement
55 """Represent a Sequence Feature on an object.
56
57 Attributes:
58 o location - the location of the feature on the sequence (FeatureLocation)
59 o type - the specified type of the feature (ie. CDS, exon, repeat...)
60 o location_operator - a string specifying how this SeqFeature may
61 be related to others. For example, in the example GenBank feature
62 shown below, the location_operator would be "join"
63 o strand - A value specifying on which strand (of a DNA sequence, for
64 instance) the feature deals with. 1 indicates the plus strand, -1
65 indicates the minus strand, 0 indicates stranded but unknown (? in GFF3),
66 while the default of None indicates that strand doesn't apply (dot in GFF3,
67 e.g. features on proteins). Note this is a shortcut for accessing the
68 strand property of the feature's location.
69 o id - A string identifier for the feature.
70 o ref - A reference to another sequence. This could be an accession
71 number for some different sequence. Note this is a shortcut for the
72 reference property of the feature's location.
73 o ref_db - A different database for the reference accession number.
74 Note this is a shortcut for the reference property of the location
75 o qualifiers - A dictionary of qualifiers on the feature. These are
76 analogous to the qualifiers from a GenBank feature table. The keys of
77 the dictionary are qualifier names, the values are the qualifier
78 values.
79 o sub_features - Additional SeqFeatures which fall under this 'parent'
80 feature. For instance, if we have something like:
81
82 CDS join(1..10,30..40,50..60)
83
84 Then the top level feature would be of type 'CDS' from 1 to 60 (actually 0
85 to 60 in Python counting) with location_operator='join', and the three sub-
86 features would also be of type 'CDS', and would be from 1 to 10, 30 to
87 40 and 50 to 60, respectively (although actually using Python counting).
88
89 To get the nucleotide sequence for this CDS, you would need to take the
90 parent sequence and do seq[0:10]+seq[29:40]+seq[49:60] (Python counting).
91 Things are more complicated with strands and fuzzy positions. To save you
92 dealing with all these special cases, the SeqFeature provides an extract
93 method to do this for you.
94 """
95 - def __init__(self, location = None, type = '', location_operator = '',
96 strand = None, id = "<unknown id>",
97 qualifiers = None, sub_features = None,
98 ref = None, ref_db = None):
99 """Initialize a SeqFeature on a Sequence.
100
101 location can either be a FeatureLocation (with strand argument also
102 given if required), or None.
103
104 e.g. With no strand, on the forward strand, and on the reverse strand:
105
106 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
107 >>> f1 = SeqFeature(FeatureLocation(5, 10), type="domain")
108 >>> f1.strand == f1.location.strand == None
109 True
110 >>> f2 = SeqFeature(FeatureLocation(7, 110, strand=1), type="CDS")
111 >>> f2.strand == f2.location.strand == +1
112 True
113 >>> f3 = SeqFeature(FeatureLocation(9, 108, strand=-1), type="CDS")
114 >>> f3.strand == f3.location.strand == -1
115 True
116
117 An invalid strand will trigger an exception:
118
119 >>> f4 = SeqFeature(FeatureLocation(50, 60), strand=2)
120 Traceback (most recent call last):
121 ...
122 ValueError: Strand should be +1, -1, 0 or None, not 2
123
124 Similarly if set via the FeatureLocation directly:
125
126 >>> loc4 = FeatureLocation(50, 60, strand=2)
127 Traceback (most recent call last):
128 ...
129 ValueError: Strand should be +1, -1, 0 or None, not 2
130
131 For exact start/end positions, an integer can be used (as shown above)
132 as shorthand for the ExactPosition object. For non-exact locations, the
133 FeatureLocation must be specified via the appropriate position objects.
134 """
135 if location is not None and not isinstance(location, FeatureLocation):
136 raise TypeError("FeatureLocation (or None) required for the location")
137 self.location = location
138
139 self.type = type
140 self.location_operator = location_operator
141 if strand is not None:
142 self.strand = strand
143 self.id = id
144 if qualifiers is None:
145 qualifiers = {}
146 self.qualifiers = qualifiers
147 if sub_features is None:
148 sub_features = []
149 self.sub_features = sub_features
150 if ref is not None:
151 self.ref = ref
152 if ref_db is not None:
153 self.ref_db = ref_db
154
157
159 try:
160 self.location.strand = value
161 except AttributeError:
162 if self.location is None:
163 if value is not None:
164 raise ValueError("Can't set strand without a location.")
165 else:
166 raise
167
168 strand = property(fget = _get_strand, fset = _set_strand,
169 doc = """Feature's strand
170
171 This is a shortcut for feature.location.strand
172 """)
173
176
178 try:
179 self.location.ref = value
180 except AttributeError:
181 if self.location is None:
182 if value is not None:
183 raise ValueError("Can't set ref without a location.")
184 else:
185 raise
186 ref = property(fget = _get_ref, fset = _set_ref,
187 doc = """Feature location reference (e.g. accession).
188
189 This is a shortcut for feature.location.ref
190 """)
191
194
197 ref_db = property(fget = _get_ref_db, fset = _set_ref_db,
198 doc = """Feature location reference's database.
199
200 This is a shortcut for feature.location.ref_db
201 """)
202
204 """A string representation of the record for debugging."""
205 answer = "%s(%s" % (self.__class__.__name__, repr(self.location))
206 if self.type:
207 answer += ", type=%s" % repr(self.type)
208 if self.location_operator:
209 answer += ", location_operator=%s" % repr(self.location_operator)
210 if self.id and self.id != "<unknown id>":
211 answer += ", id=%s" % repr(self.id)
212 if self.ref:
213 answer += ", ref=%s" % repr(self.ref)
214 if self.ref_db:
215 answer += ", ref_db=%s" % repr(self.ref_db)
216 answer += ")"
217 return answer
218
220 """A readable summary of the feature intended to be printed to screen.
221 """
222 out = "type: %s\n" % self.type
223 out += "location: %s\n" % self.location
224 if self.id and self.id != "<unknown id>":
225 out += "id: %s\n" % self.id
226 out += "qualifiers: \n"
227 for qual_key in sorted(self.qualifiers):
228 out += " Key: %s, Value: %s\n" % (qual_key,
229 self.qualifiers[qual_key])
230 if len(self.sub_features) != 0:
231 out += "Sub-Features\n"
232 for sub_feature in self.sub_features:
233 out +="%s\n" % sub_feature
234 return out
235
237 """Returns a copy of the feature with its location shifted (PRIVATE).
238
239 The annotation qualifiers are copied."""
240 return SeqFeature(location = self.location._shift(offset),
241 type = self.type,
242 location_operator = self.location_operator,
243 id = self.id,
244 qualifiers = dict(self.qualifiers.iteritems()),
245 sub_features = [f._shift(offset) for f in self.sub_features])
246
def _flip(self, length):
    """Returns a copy of the feature with its location flipped (PRIVATE).

    The argument length gives the length of the parent sequence. For
    example a location 0..20 (+1 strand) with parent length 30 becomes
    after flipping 10..30 (-1 strand). Strandless (None) or unknown
    strand (0) remain like that - just their end points are changed.

    Any sub-features are flipped as well and returned in reverse order.
    The annotation qualifiers are copied into a new dictionary (a shallow
    copy, so the values themselves are shared with the original).
    """
    # dict(mapping) gives the same shallow copy as going via iteritems(),
    # without relying on a method that only Python 2 dictionaries offer.
    return SeqFeature(location = self.location._flip(length),
                      type = self.type,
                      location_operator = self.location_operator,
                      id = self.id,
                      qualifiers = dict(self.qualifiers),
                      sub_features = [f._flip(length)
                                      for f in self.sub_features[::-1]])
263
265 """Extract feature sequence from the supplied parent sequence.
266
267 The parent_sequence can be a Seq like object or a string, and will
268 generally return an object of the same type. The exception to this is
269 a MutableSeq as the parent sequence will return a Seq object.
270
271 This should cope with complex locations including complements, joins
272 and fuzzy positions. Even mixed strand features should work! This
273 also covers features on protein sequences (e.g. domains), although
274 here reverse strand features are not permitted.
275
276 >>> from Bio.Seq import Seq
277 >>> from Bio.Alphabet import generic_protein
278 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
279 >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
280 >>> f = SeqFeature(FeatureLocation(8,15), type="domain")
281 >>> f.extract(seq)
282 Seq('VALIVIC', ProteinAlphabet())
283
284 Note - currently only sub-features of type "join" are supported.
285 """
286 if isinstance(parent_sequence, MutableSeq):
287
288
289 parent_sequence = parent_sequence.toseq()
290 if self.sub_features:
291 if self.location_operator != "join":
292 raise ValueError(self.location_operator)
293 if self.location.strand == -1:
294
295
296 parts = []
297 for f_sub in self.sub_features[::-1]:
298 assert f_sub.location.strand == -1
299 parts.append(f_sub.location.extract(parent_sequence))
300 else:
301
302 parts = [f_sub.location.extract(parent_sequence)
303 for f_sub in self.sub_features]
304
305 f_seq = parts[0]
306 for part in parts[1:]:
307 f_seq += part
308 return f_seq
309 else:
310 return self.location.extract(parent_sequence)
311
313 """Returns True regardless of the length of the feature.
314
315 This behaviour is for backwards compatibility, since until the
316 __len__ method was added, a SeqFeature always evaluated as True.
317
318 Note that in comparison, Seq objects, strings, lists, etc, will all
319 evaluate to False if they have length zero.
320
321 WARNING: The SeqFeature may in future evaluate to False when its
322 length is zero (in order to better match normal python behaviour)!
323 """
324 return True
325
327 """Returns the length of the region described by a feature.
328
329 >>> from Bio.Seq import Seq
330 >>> from Bio.Alphabet import generic_protein
331 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
332 >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
333 >>> f = SeqFeature(FeatureLocation(8,15), type="domain")
334 >>> len(f)
335 7
336 >>> f.extract(seq)
337 Seq('VALIVIC', ProteinAlphabet())
338 >>> len(f.extract(seq))
339 7
340
341 For simple features without subfeatures this is the same as the region
342 spanned (end position minus start position). However, for a feature
343 defined by combining several subfeatures (e.g. a CDS as the join of
344 several exons) the gaps are not counted (e.g. introns). This ensures
345 that len(f) == len(f.extract(parent_seq)), and also makes sure things
346 work properly with features wrapping the origin etc.
347 """
348 if self.sub_features:
349 return sum(len(f) for f in self.sub_features)
350 else:
351 return len(self.location)
352
354 """Iterate over the parent positions within the feature.
355
356 The iteration order is strand aware, and can be thought of as moving
357 along the feature using the parent sequence coordinates:
358
359 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
360 >>> f = SeqFeature(FeatureLocation(5,10), type="domain", strand=-1)
361 >>> len(f)
362 5
363 >>> for i in f: print i
364 9
365 8
366 7
367 6
368 5
369 >>> list(f)
370 [9, 8, 7, 6, 5]
371 """
372 if self.sub_features:
373 if self.strand == -1:
374 for f in self.sub_features[::-1]:
375 for i in f.location:
376 yield i
377 else:
378 for f in self.sub_features:
379 for i in f.location:
380 yield i
381 else:
382 for i in self.location:
383 yield i
384
386 """Check if an integer position is within the feature.
387
388 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
389 >>> f = SeqFeature(FeatureLocation(5,10), type="domain", strand=-1)
390 >>> len(f)
391 5
392 >>> [i for i in range(15) if i in f]
393 [5, 6, 7, 8, 9]
394
395 For example, to see which features include a SNP position, you could
396 use this:
397
398 >>> from Bio import SeqIO
399 >>> record = SeqIO.read("GenBank/NC_000932.gb", "gb")
400 >>> for f in record.features:
401 ... if 1750 in f:
402 ... print f.type, f.location
403 source [0:154478](+)
404 gene [1716:4347](-)
405 tRNA [1716:4347](-)
406
407 Note that for a feature defined as a join of several subfeatures (e.g.
408 the union of several exons) the gaps are not checked (e.g. introns).
409 In this example, the tRNA location is defined in the GenBank file as
410 complement(join(1717..1751,4311..4347)), so that position 1760 falls
411 in the gap:
412
413 >>> for f in record.features:
414 ... if 1760 in f:
415 ... print f.type, f.location
416 source [0:154478](+)
417 gene [1716:4347](-)
418
419 Note that additional care may be required with fuzzy locations, for
420 example just before a BeforePosition:
421
422 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
423 >>> from Bio.SeqFeature import BeforePosition
424 >>> f = SeqFeature(FeatureLocation(BeforePosition(3),8), type="domain")
425 >>> len(f)
426 5
427 >>> [i for i in range(10) if i in f]
428 [3, 4, 5, 6, 7]
429 """
430 if not isinstance(value, int):
431 raise ValueError("Currently we only support checking for integer "
432 "positions being within a SeqFeature.")
433 if self.sub_features:
434 for f in self.sub_features:
435 if value in f:
436 return True
437 return False
438 else:
439 return value in self.location
440
446 """Represent a Generic Reference object.
447
448 Attributes:
449 o location - A list of Location objects specifying regions of
450 the sequence that the references correspond to. If no locations are
451 specified, the entire sequence is assumed.
452 o authors - A big old string, or a list split by author, of authors
453 for the reference.
454 o title - The title of the reference.
455 o journal - Journal the reference was published in.
456 o medline_id - A medline reference for the article.
457 o pubmed_id - A pubmed reference for the article.
458 o comment - A place to stick any comments about the reference.
459 """
469
471 """Output an informative string for debugging.
472 """
473 out = ""
474 for single_location in self.location:
475 out += "location: %s\n" % single_location
476 out += "authors: %s\n" % self.authors
477 if self.consrtm:
478 out += "consrtm: %s\n" % self.consrtm
479 out += "title: %s\n" % self.title
480 out += "journal: %s\n" % self.journal
481 out += "medline id: %s\n" % self.medline_id
482 out += "pubmed id: %s\n" % self.pubmed_id
483 out += "comment: %s\n" % self.comment
484 return out
485
487
488 return "%s(title=%s, ...)" % (self.__class__.__name__,
489 repr(self.title))
490
495 """Specify the location of a feature along a sequence.
496
497 The FeatureLocation is used for simple continuous features, which can
498 be described as running from a start position to an end position
499 (optionally with a strand and reference information). More complex
500 locations made up from several non-continuous parts (e.g. a coding
501 sequence made up of several exons) are currently described using a
502 SeqFeature with sub-features.
503
504 Note that the start and end location numbering follow Python's scheme,
505 thus a GenBank entry of 123..150 (one based counting) becomes a location
506 of [122:150] (zero based counting).
507
508 >>> from Bio.SeqFeature import FeatureLocation
509 >>> f = FeatureLocation(122, 150)
510 >>> print f
511 [122:150]
512 >>> print f.start
513 122
514 >>> print f.end
515 150
516 >>> print f.strand
517 None
518
519 Note the strand defaults to None. If you are working with nucleotide
520 sequences you'd want to be explicit if it is the forward strand:
521
522 >>> from Bio.SeqFeature import FeatureLocation
523 >>> f = FeatureLocation(122, 150, strand=+1)
524 >>> print f
525 [122:150](+)
526 >>> print f.strand
527 1
528
529 Note that for a parent sequence of length n, the FeatureLocation
530 start and end must satisfy the inequality 0 <= start <= end <= n.
531 This means even for features on the reverse strand of a nucleotide
532 sequence, we expect the 'start' coordinate to be less than the
533 'end' coordinate.
534
535 >>> from Bio.SeqFeature import FeatureLocation
536 >>> r = FeatureLocation(122, 150, strand=-1)
537 >>> print r
538 [122:150](-)
539 >>> print r.start
540 122
541 >>> print r.end
542 150
543 >>> print r.strand
544 -1
545
546 i.e. Rather than thinking of the 'start' and 'end' biologically in a
547 strand aware manner, think of them as the 'left most' or 'minimum'
548 boundary, and the 'right most' or 'maximum' boundary of the region
549 being described.
550
551 In the example above we have used standard exact positions, but there
552 are also specialised position objects used to represent fuzzy positions
553 as well, for example a GenBank location like complement(<123..150)
554 would use a BeforePosition object for the start.
555 """
def __init__(self, start, end, strand=None, ref=None, ref_db=None):
    """Specify the start, end, strand etc of a sequence feature.

    The start and end arguments say where the feature begins and ends.
    Each may be any of the *Position objects inheriting from
    AbstractPosition, or a plain integer. Integers are taken to be exact
    and are wrapped in ExactPosition objects, which keeps the common
    non-fuzzy case short. Any other type raises a TypeError.

    i.e. Short form:

    >>> from Bio.SeqFeature import FeatureLocation
    >>> loc = FeatureLocation(5, 10, strand=-1)
    >>> print loc
    [5:10](-)

    Explicit form:

    >>> from Bio.SeqFeature import FeatureLocation, ExactPosition
    >>> loc = FeatureLocation(ExactPosition(5), ExactPosition(10), strand=-1)
    >>> print loc
    [5:10](-)

    Other fuzzy positions are used similarly,

    >>> from Bio.SeqFeature import FeatureLocation
    >>> from Bio.SeqFeature import BeforePosition, AfterPosition
    >>> loc2 = FeatureLocation(BeforePosition(5), AfterPosition(10), strand=-1)
    >>> print loc2
    [<5:>10](-)

    For nucleotide features you will also want to specify the strand,
    use 1 for the forward (plus) strand, -1 for the reverse (negative)
    strand, 0 for stranded but strand unknown (? in GFF3), or None for
    when the strand does not apply (dot in GFF3), e.g. features on
    proteins.

    >>> loc = FeatureLocation(5, 10, strand=+1)
    >>> print loc
    [5:10](+)
    >>> print loc.strand
    1

    Normally feature locations are given relative to the parent
    sequence you are working with, but an explicit accession can
    be given with the optional ref and ref_db strings:

    >>> loc = FeatureLocation(105172, 108462, ref="AL391218.9", strand=1)
    >>> print loc
    AL391218.9[105172:108462](+)
    >>> print loc.ref
    AL391218.9

    """
    # Position objects are stored as given; integers get wrapped.
    if not isinstance(start, AbstractPosition):
        if not (isinstance(start, int) or isinstance(start, long)):
            raise TypeError("start=%r %s" % (start, type(start)))
        start = ExactPosition(start)
    self._start = start

    # The end point is normalised in exactly the same way.
    if not isinstance(end, AbstractPosition):
        if not (isinstance(end, int) or isinstance(end, long)):
            raise TypeError("end=%r %s" % (end, type(end)))
        end = ExactPosition(end)
    self._end = end

    # Assigned via the strand property.
    self.strand = strand
    self.ref = ref
    self.ref_db = ref_db
627
630
632 if value not in [+1, -1, 0, None]:
633 raise ValueError("Strand should be +1, -1, 0 or None, not %r"
634 % value)
635 self._strand = value
636
637 strand = property(fget = _get_strand, fset = _set_strand,
638 doc = "Strand of the location (+1, -1, 0 or None).")
639
641 """Returns a representation of the location (with python counting).
642
643 For the simple case this uses the python splicing syntax, [122:150]
644 (zero based counting) which GenBank would call 123..150 (one based
645 counting).
646 """
647 answer = "[%s:%s]" % (self._start, self._end)
648 if self.ref and self.ref_db:
649 answer = "%s:%s%s" % (self.ref_db, self.ref, answer)
650 elif self.ref:
651 answer = self.ref + answer
652
653 if self.strand is None:
654 return answer
655 elif self.strand == +1:
656 return answer + "(+)"
657 elif self.strand == -1:
658 return answer + "(-)"
659 else:
660
661 return answer + "(?)"
662
664 """A string representation of the location for debugging."""
665 optional = ""
666 if self.strand is not None:
667 optional += ", strand=%r" % self.strand
668 if self.ref is not None:
669 optional += ", ref=%r" % self.ref
670 if self.ref_db is not None:
671 optional += ", ref_db=%r" % self.ref_db
672 return "%s(%r, %r%s)" \
673 % (self.__class__.__name__, self.start, self.end, optional)
674
676 """Returns True regardless of the length of the feature.
677
678 This behaviour is for backwards compatibility, since until the
679 __len__ method was added, a FeatureLocation always evaluated as True.
680
681 Note that in comparison, Seq objects, strings, lists, etc, will all
682 evaluate to False if they have length zero.
683
684 WARNING: The FeatureLocation may in future evaluate to False when its
685 length is zero (in order to better match normal python behaviour)!
686 """
687 return True
688
690 """Returns the length of the region described by the FeatureLocation.
691
692 Note that extra care may be needed for fuzzy locations, e.g.
693
694 >>> from Bio.SeqFeature import FeatureLocation
695 >>> from Bio.SeqFeature import BeforePosition, AfterPosition
696 >>> loc = FeatureLocation(BeforePosition(5),AfterPosition(10))
697 >>> len(loc)
698 5
699 """
700 return int(self._end) - int(self._start)
701
703 """Check if an integer position is within the FeatureLocation.
704
705 Note that extra care may be needed for fuzzy locations, e.g.
706
707 >>> from Bio.SeqFeature import FeatureLocation
708 >>> from Bio.SeqFeature import BeforePosition, AfterPosition
709 >>> loc = FeatureLocation(BeforePosition(5),AfterPosition(10))
710 >>> len(loc)
711 5
712 >>> [i for i in range(15) if i in loc]
713 [5, 6, 7, 8, 9]
714 """
715 if not isinstance(value, int):
716 raise ValueError("Currently we only support checking for integer "
717 "positions being within a FeatureLocation.")
718 if value < self._start or value >= self._end:
719 return False
720 else:
721 return True
722
724 """Iterate over the parent positions within the FeatureLocation.
725
726 >>> from Bio.SeqFeature import FeatureLocation
727 >>> from Bio.SeqFeature import BeforePosition, AfterPosition
728 >>> loc = FeatureLocation(BeforePosition(5),AfterPosition(10))
729 >>> len(loc)
730 5
731 >>> for i in loc: print i
732 5
733 6
734 7
735 8
736 9
737 >>> list(loc)
738 [5, 6, 7, 8, 9]
739 >>> [i for i in range(15) if i in loc]
740 [5, 6, 7, 8, 9]
741
742 Note this is strand aware:
743
744 >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10), strand = -1)
745 >>> list(loc)
746 [9, 8, 7, 6, 5]
747 """
748 if self.strand == -1:
749 for i in range(self._end - 1, self._start - 1, -1):
750 yield i
751 else:
752 for i in range(self._start, self._end):
753 yield i
754
763
764 - def _flip(self, length):
765 """Returns a copy of the location after the parent is reversed (PRIVATE)."""
766 if self.ref or self.ref_db:
767
768 raise ValueError("Feature references another sequence.")
769
770 if self.strand == +1:
771 flip_strand = -1
772 elif self.strand == -1:
773 flip_strand = +1
774 else:
775
776 flip_strand = self.strand
777 return FeatureLocation(start = self._end._flip(length),
778 end = self._start._flip(length),
779 strand = flip_strand)
780
781 @property
783 """Start location (integer like, possibly a fuzzy position, read only)."""
784 return self._start
785
786 @property
788 """End location (integer like, possibly a fuzzy position, read only)."""
789 return self._end
790
791 @property
793 """Start position (integer, approximated if fuzzy, read only) (OBSOLETE).
794
795 This is now an alias for int(feature.start), which should be
796 used in preference -- unless you are trying to support old
797 versions of Biopython.
798 """
799 return int(self._start)
800
801 @property
803 """End position (integer, approximated if fuzzy, read only) (OBSOLETE).
804
805 This is now an alias for int(feature.end), which should be
806 used in preference -- unless you are trying to support old
807 versions of Biopython.
808 """
809 return int(self._end)
810
812 """Extract feature sequence from the supplied parent sequence."""
813 if self.ref or self.ref_db:
814
815 raise ValueError("Feature references another sequence.")
816 if isinstance(parent_sequence, MutableSeq):
817
818
819 parent_sequence = parent_sequence.toseq()
820 f_seq = parent_sequence[self.nofuzzy_start:self.nofuzzy_end]
821 if self.strand == -1:
822 try:
823 f_seq = f_seq.reverse_complement()
824 except AttributeError:
825 assert isinstance(f_seq, str)
826 f_seq = reverse_complement(f_seq)
827 return f_seq
828
831 """Abstract base class representing a position.
832 """
833
835 """String representation of the location for debugging."""
836 return "%s(...)" % (self.__class__.__name__)
837
840 """Specify the specific position of a boundary.
841
842 o position - The position of the boundary.
843 o extension - An optional argument which must be zero since we don't
844 have an extension. The argument is provided so that the same number of
845 arguments can be passed to all position types.
846
847 In this case, there is no fuzziness associated with the position.
848
849 >>> p = ExactPosition(5)
850 >>> p
851 ExactPosition(5)
852 >>> print p
853 5
854
855 >>> isinstance(p, AbstractPosition)
856 True
857 >>> isinstance(p, int)
858 True
859
860 Integer comparisons and operations should work as expected:
861
862 >>> p == 5
863 True
864 >>> p < 6
865 True
866 >>> p <= 5
867 True
868 >>> p + 10
869 15
870
871 """
872 - def __new__(cls, position, extension = 0):
877
879 """String representation of the ExactPosition location for debugging."""
880 return "%s(%i)" % (self.__class__.__name__, int(self))
881
882 @property
884 """Legacy attribute to get position as integer (OBSOLETE)."""
885 return int(self)
886
887 @property
889 """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
890 return 0
891
893
894 return self.__class__(int(self) + offset)
895
896 - def _flip(self, length):
897
898 return self.__class__(length - int(self))
899
902 """Specify a specific position which is uncertain.
903
904 This is used in UniProt, e.g. ?222 for uncertain position 222, or in the
905 XML format explicitly marked as uncertain. Does not apply to GenBank/EMBL.
906 """
907 pass
908
911 """Specify a specific position which is unknown (has no position).
912
913 This is used in UniProt, e.g. ? or in the XML as unknown.
914 """
915
917 """String representation of the UnknownPosition location for debugging."""
918 return "%s()" % self.__class__.__name__
919
922
923 @property
925 """Legacy attribute to get position (None) (OBSOLETE)."""
926 return None
927
928 @property
930 """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
931 return 0
932
935
936 - def _flip(self, length):
938
941 """Specify the position of a boundary within some coordinates.
942
943 Arguments:
944 o position - The default integer position
945 o left - The start (left) position of the boundary
946 o right - The end (right) position of the boundary
947
948 This allows dealing with a position like ((1.4)..100). This
949 indicates that the start of the sequence is somewhere between 1
950 and 4. Since this is a start coordinate, it should act like
951 it is at position 1 (or in Python counting, 0).
952
953 >>> p = WithinPosition(10,10,13)
954 >>> p
955 WithinPosition(10, left=10, right=13)
956 >>> print p
957 (10.13)
958 >>> int(p)
959 10
960
961 Basic integer comparisons and operations should work as though
962 this were a plain integer:
963
964 >>> p == 10
965 True
966 >>> p in [9,10,11]
967 True
968 >>> p < 11
969 True
970 >>> p + 10
971 20
972
973 >>> isinstance(p, WithinPosition)
974 True
975 >>> isinstance(p, AbstractPosition)
976 True
977 >>> isinstance(p, int)
978 True
979
980 Note this also applies for comparison to other position objects,
981 where again the integer behaviour is used:
982
983 >>> p == 10
984 True
985 >>> p == ExactPosition(10)
986 True
987 >>> p == BeforePosition(10)
988 True
989 >>> p == AfterPosition(10)
990 True
991
992 If this were an end point, you would want the position to be 13:
993
994 >>> p2 = WithinPosition(13,10,13)
995 >>> p2
996 WithinPosition(13, left=10, right=13)
997 >>> print p2
998 (10.13)
999 >>> int(p2)
1000 13
1001 >>> p2 == 13
1002 True
1003 >>> p2 == ExactPosition(13)
1004 True
1005
1006 The old legacy properties of position and extension give the
1007 starting/lower/left position as an integer, and the distance
1008 to the ending/higher/right position as an integer. Note that
1009 the position object will act like either the left or the right
1010 end-point depending on how it was created:
1011
1012 >>> p.position == p2.position == 10
1013 True
1014 >>> p.extension == p2.extension == 3
1015 True
1016 >>> int(p) == int(p2)
1017 False
1018 >>> p == 10
1019 True
1020 >>> p2 == 13
1021 True
1022
1023 """
1024 - def __new__(cls, position, left, right):
1025 assert position==left or position==right, \
1026 "WithinPosition: %r should match left %r or right %r" \
1027 (position, left, right)
1028 obj = int.__new__(cls, position)
1029 obj._left = left
1030 obj._right = right
1031 return obj
1032
1034 """String representation of the WithinPosition location for debugging."""
1035 return "%s(%i, left=%i, right=%i)" \
1036 % (self.__class__.__name__, int(self),
1037 self._left, self._right)
1038
1040 return "(%s.%s)" % (self._left, self._right)
1041
1042 @property
1044 """Legacy attribute to get (left) position as integer (OBSOLETE)."""
1045 return self._left
1046
1047 @property
1049 """Legacy attribute to get extension (from left to right) as an integer (OBSOLETE)."""
1050 return self._right - self._left
1051
1053 return self.__class__(int(self) + offset,
1054 self._left + offset,
1055 self._right + offset)
1056
1057 - def _flip(self, length):
1058 return self.__class__(length - int(self),
1059 length - self._right,
1060 length - self._left)
1061
1064 """Specify the position of a boundary between two coordinates (OBSOLETE?).
1065
1066 Arguments:
1067 o position - The default integer position
1068 o left - The start (left) position of the boundary
1069 o right - The end (right) position of the boundary
1070
1071 This allows dealing with a position like 123^456. This
1072 indicates that the start of the sequence is somewhere between
1073 123 and 456. It is up to the parser to set the position argument
1074 to either boundary point (depending on if this is being used as
1075 a start or end of the feature). For example as a feature end:
1076
1077 >>> p = BetweenPosition(456, 123, 456)
1078 >>> p
1079 BetweenPosition(456, left=123, right=456)
1080 >>> print p
1081 (123^456)
1082 >>> int(p)
1083 456
1084
1085 Integer equality and comparison use the given position,
1086
1087 >>> p == 456
1088 True
1089 >>> p in [455, 456, 457]
1090 True
1091 >>> p > 300
1092 True
1093
1094 The old legacy properties of position and extension give the
1095 starting/lower/left position as an integer, and the distance
1096 to the ending/higher/right position as an integer. Note that
1097 the position object will act like either the left or the right
1098 end-point depending on how it was created:
1099
1100 >>> p2 = BetweenPosition(123, left=123, right=456)
1101 >>> p.position == p2.position == 123
1102 True
1103 >>> p.extension
1104 333
1105 >>> p2.extension
1106 333
1107 >>> p.extension == p2.extension == 333
1108 True
1109 >>> int(p) == int(p2)
1110 False
1111 >>> p == 456
1112 True
1113 >>> p2 == 123
1114 True
1115
1116 Note this potentially surprising behaviour:
1117
1118 >>> BetweenPosition(123, left=123, right=456) == ExactPosition(123)
1119 True
1120 >>> BetweenPosition(123, left=123, right=456) == BeforePosition(123)
1121 True
1122 >>> BetweenPosition(123, left=123, right=456) == AfterPosition(123)
1123 True
1124
1125 i.e. For equality (and sorting) the position objects behave like
1126 integers.
1127 """
1128 - def __new__(cls, position, left, right):
1134
1136 """String representation of the BetweenPosition location for debugging."""
1137 return "%s(%i, left=%i, right=%i)" \
1138 % (self.__class__.__name__, int(self),
1139 self._left, self._right)
1140
1142 return "(%s^%s)" % (self._left, self._right)
1143
1144 @property
1146 """Legacy attribute to get (left) position as integer (OBSOLETE)."""
1147 return self._left
1148
1149 @property
1151 """Legacy attribute to get extension (from left to right) as an integer (OBSOLETE)."""
1152 return self._right - self._left
1153
1155 return self.__class__(int(self) + offset,
1156 self._left + offset,
1157 self._right + offset)
1158
def _flip(self, length):
    """Return a new position with every coordinate subtracted from length.

    The old right bound becomes the new left bound and vice versa, and
    the result is built with the same class as this position.
    """
    flipped_default = length - int(self)
    flipped_left = length - self._right
    flipped_right = length - self._left
    return self.__class__(flipped_default, flipped_left, flipped_right)
1163
1166 """Specify a position where the actual location occurs before it.
1167
1168 Arguments:
1169 o position - The upper boundary of where the location can occur.
1170 o extension - An optional argument which must be zero since we don't
1171 have an extension. The argument is provided so that the same number of
1172 arguments can be passed to all position types.
1173
1174 This is used to specify positions like (<10..100) where the location
1175 occurs somewhere before position 10.
1176
1177 >>> p = BeforePosition(5)
1178 >>> p
1179 BeforePosition(5)
1180 >>> print p
1181 <5
1182 >>> int(p)
1183 5
1184 >>> p + 10
1185 15
1186
1187 Note this potentially surprising behaviour:
1188
1189 >>> p == ExactPosition(5)
1190 True
1191 >>> p == AfterPosition(5)
1192 True
1193
1194 Just remember that for equality and sorting the position objects act
1195 like integers.
1196 """
1197
1198 - def __new__(cls, position, extension = 0):
1203
1204 @property
1206 """Legacy attribute to get position as integer (OBSOLETE)."""
1207 return int(self)
1208
1209 @property
1211 """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
1212 return 0
1213
1215 """A string representation of the location for debugging."""
1216 return "%s(%i)" % (self.__class__.__name__, int(self))
1217
1220
1222 return self.__class__(int(self) + offset)
1223
1224 - def _flip(self, length):
1226
1229 """Specify a position where the actual location is found after it.
1230
1231 Arguments:
1232 o position - The lower boundary of where the location can occur.
1233 o extension - An optional argument which must be zero since we don't
1234 have an extension. The argument is provided so that the same number of
1235 arguments can be passed to all position types.
1236
1237 This is used to specify positions like (>10..100) where the location
1238 occurs somewhere after position 10.
1239
1240 >>> p = AfterPosition(7)
1241 >>> p
1242 AfterPosition(7)
1243 >>> print p
1244 >7
1245 >>> int(p)
1246 7
1247 >>> p + 10
1248 17
1249
1250 >>> isinstance(p, AfterPosition)
1251 True
1252 >>> isinstance(p, AbstractPosition)
1253 True
1254 >>> isinstance(p, int)
1255 True
1256
1257 Note this potentially surprising behaviour:
1258
1259 >>> p == ExactPosition(7)
1260 True
1261 >>> p == BeforePosition(7)
1262 True
1263
1264 Just remember that for equality and sorting the position objects act
1265 like integers.
1266 """
1267
1268 - def __new__(cls, position, extension = 0):
1273
1274 @property
1276 """Legacy attribute to get position as integer (OBSOLETE)."""
1277 return int(self)
1278
1279 @property
1281 """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
1282 return 0
1283
1285 """A string representation of the location for debugging."""
1286 return "%s(%i)" % (self.__class__.__name__, int(self))
1287
1290
1292 return self.__class__(int(self) + offset)
1293
1294 - def _flip(self, length):
1296
1299 """Specify a position where the location can be multiple positions.
1300
1301 This models the GenBank 'one-of(1888,1901)' function, and tries
1302 to make this fit within the Biopython Position models. If this was
1303 a start position it should act like 1888, but as an end position 1901.
1304
1305 >>> p = OneOfPosition(1888, [ExactPosition(1888), ExactPosition(1901)])
1306 >>> p
1307 OneOfPosition(1888, choices=[ExactPosition(1888), ExactPosition(1901)])
1308 >>> int(p)
1309 1888
1310
1311 Integer comparisons and operators act like using int(p),
1312
1313 >>> p == 1888
1314 True
1315 >>> p <= 1888
1316 True
1317 >>> p > 1888
1318 False
1319 >>> p + 100
1320 1988
1321
1322 >>> isinstance(p, OneOfPosition)
1323 True
1324 >>> isinstance(p, AbstractPosition)
1325 True
1326 >>> isinstance(p, int)
1327 True
1328
1329 The old legacy properties of position and extension give the
1330 starting/lowest/left-most position as an integer, and the
1331 distance to the ending/highest/right-most position as an integer.
1332 Note that the position object will act like one of the list of
1333 possible locations depending on how it was created:
1334
1335 >>> p2 = OneOfPosition(1901, [ExactPosition(1888), ExactPosition(1901)])
1336 >>> p.position == p2.position == 1888
1337 True
1338 >>> p.extension == p2.extension == 13
1339 True
1340 >>> int(p) == int(p2)
1341 False
1342 >>> p == 1888
1343 True
1344 >>> p2 == 1901
1345 True
1346
1347 """
def __new__(cls, position, choices):
    """Initialize with a set of possible positions.

    Arguments:
     o position - The integer value the new object takes; it must be
       found in choices.
     o choices - The possible locations. The object passed in is stored
       as-is (not copied) on the position_choices attribute.

    Raises AssertionError if position is not in choices (the check is an
    assert statement, so it is skipped under python -O).
    """
    assert position in choices, \
        "OneOfPosition: %r should match one of %r" % (position, choices)
    instance = int.__new__(cls, position)
    instance.position_choices = choices
    return instance
1361
1362 @property
1364 """Legacy attribute to get (left) position as integer (OBSOLETE)."""
1365 return min(int(pos) for pos in self.position_choices)
1366
1367 @property
1369 """Legacy attribute to get extension as integer (OBSOLETE)."""
1370 positions = [int(pos) for pos in self.position_choices]
1371 return max(positions) - min(positions)
1372
1374 """String representation of the OneOfPosition location for debugging."""
1375 return "%s(%i, choices=%r)" % (self.__class__.__name__,
1376 int(self), self.position_choices)
1377
1379 out = "one-of("
1380 for position in self.position_choices:
1381 out += "%s," % position
1382
1383 out = out[:-1] + ")"
1384 return out
1385
1387 return self.__class__(int(self) + offset,
1388 [p._shift(offset) for p in self.position_choices])
1389
def _flip(self, length):
    """Return a new position with its value and choices flipped.

    The integer value becomes length minus the current value. Each entry
    of position_choices is flipped through its own _flip method, and the
    entries are taken in reverse order.
    """
    flipped_default = length - int(self)
    flipped_choices = [choice._flip(length)
                       for choice in reversed(self.position_choices)]
    return self.__class__(flipped_default, flipped_choices)
1393
1396 """Simple class to hold information about a gap between positions.
1397 """
1399 """Initialize with a position object containing the gap information.
1400 """
1401 self.gap_size = gap_size
1402
1404 """A string representation of the position gap for debugging."""
1405 return "%s(%s)" % (self.__class__.__name__, repr(self.gap_size))
1406
1408 out = "gap(%s)" % self.gap_size
1409 return out
1410
1411
if __name__ == "__main__":
    # Only when this file is executed directly as a script: import the
    # helper lazily and call it. Judging by its name, run_doctest runs
    # the docstring examples in this module as doctests; confirm in
    # Bio._utils.
    from Bio._utils import run_doctest
    run_doctest()
1415