Package Bio :: Module SeqFeature
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqFeature

   1  # Copyright 2000-2003 Jeff Chang. 
   2  # Copyright 2001-2008 Brad Chapman. 
   3  # Copyright 2005-2015 by Peter Cock. 
   4  # Copyright 2006-2009 Michiel de Hoon. 
   5  # All rights reserved. 
   6  # This code is part of the Biopython distribution and governed by its 
   7  # license.  Please see the LICENSE file that should have been included 
   8  # as part of this package. 
   9  """Represent a Sequence Feature holding info about a part of a sequence. 
  10   
  11  This is heavily modeled after the Biocorba SeqFeature objects, and 
  12  may be pretty biased towards GenBank stuff since I'm writing it 
  13  for the GenBank parser output... 
  14   
  15  What's here: 
  16   
  17  Base class to hold a Feature 
  18  ---------------------------- 
  19   
  20  classes: 
  21   
  22      - SeqFeature 
  23   
  24  Hold information about a Reference 
  25  ---------------------------------- 
  26   
  27  This is an attempt to create a General class to hold Reference type 
  28  information. 
  29   
  30  classes: 
  31   
  32      - Reference 
  33   
  34  Specify locations of a feature on a Sequence 
  35  -------------------------------------------- 
  36   
  37  This aims to handle, in Ewan Birney's words, 'the dreaded fuzziness issue'. 
  38  This has the advantages of allowing us to handle fuzzy stuff in case anyone 
  39  needs it, and also be compatible with BioPerl etc and BioSQL. 
  40   
  41  classes: 
  42   
  43      - FeatureLocation - Specify the start and end location of a feature. 
  44      - CompoundLocation - Collection of FeatureLocation objects (for joins etc). 
  45   
  46      - ExactPosition - Specify the position as being exact. 
  47      - WithinPosition - Specify a position occurring within some range. 
  48      - BetweenPosition - Specify a position occurring between a range (OBSOLETE?). 
  49      - BeforePosition - Specify the position as being found before some base. 
  50      - AfterPosition - Specify the position as being found after some base. 
  51      - OneOfPosition - Specify a position where the location can be multiple positions. 
  52      - UnknownPosition - Represents missing information like '?' in UniProt. 
  53  """ 
  54   
  55  from __future__ import print_function 
  56   
  57  from Bio._py3k import _is_int_or_long 
  58   
  59  from Bio.Seq import MutableSeq, reverse_complement 
class SeqFeature(object):
    """Represent a Sequence Feature on an object.

    Attributes:

        - location - the location of the feature on the sequence (FeatureLocation)
        - type - the specified type of the feature (ie. CDS, exon, repeat...)
        - location_operator - a string specifying how this SeqFeature may
          be related to others. For example, in the example GenBank feature
          shown below, the location_operator would be "join". This is a proxy
          for feature.location.operator and only applies to compound locations.
        - strand - A value specifying on which strand (of a DNA sequence, for
          instance) the feature deals with. 1 indicates the plus strand, -1
          indicates the minus strand, 0 indicates stranded but unknown (? in GFF3),
          while the default of None indicates that strand doesn't apply (dot in GFF3,
          e.g. features on proteins). Note this is a shortcut for accessing the
          strand property of the feature's location.
        - id - A string identifier for the feature.
        - ref - A reference to another sequence. This could be an accession
          number for some different sequence. Note this is a shortcut for the
          reference property of the feature's location.
        - ref_db - A different database for the reference accession number.
          Note this is a shortcut for the reference property of the location
        - qualifiers - A dictionary of qualifiers on the feature. These are
          analogous to the qualifiers from a GenBank feature table. The keys of
          the dictionary are qualifier names, the values are the qualifier
          values.
    """

    def __init__(self, location=None, type='', location_operator='',
                 strand=None, id="<unknown id>",
                 qualifiers=None, sub_features=None,
                 ref=None, ref_db=None):
        """Initialize a SeqFeature on a Sequence.

        location can either be a FeatureLocation (with strand argument also
        given if required), or None.

        e.g. With no strand, on the forward strand, and on the reverse strand:

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> f1 = SeqFeature(FeatureLocation(5, 10), type="domain")
        >>> f1.strand == f1.location.strand == None
        True
        >>> f2 = SeqFeature(FeatureLocation(7, 110, strand=1), type="CDS")
        >>> f2.strand == f2.location.strand == +1
        True
        >>> f3 = SeqFeature(FeatureLocation(9, 108, strand=-1), type="CDS")
        >>> f3.strand == f3.location.strand == -1
        True

        An invalid strand will trigger an exception:

        >>> f4 = SeqFeature(FeatureLocation(50, 60), strand=2)
        Traceback (most recent call last):
           ...
        ValueError: Strand should be +1, -1, 0 or None, not 2

        Similarly if set via the FeatureLocation directly:

        >>> loc4 = FeatureLocation(50, 60, strand=2)
        Traceback (most recent call last):
           ...
        ValueError: Strand should be +1, -1, 0 or None, not 2

        For exact start/end positions, an integer can be used (as shown above)
        as shorthand for the ExactPosition object. For non-exact locations, the
        FeatureLocation must be specified via the appropriate position objects.

        Note that the strand, ref and ref_db arguments to the SeqFeature are
        now obsolete and will be deprecated in a future release (which will
        give warning messages) and later removed. Set them via the location
        object instead.

        Note that the sub_features argument can no longer be used (passing
        anything other than None raises TypeError), and location_operator is
        only accepted when the location is a CompoundLocation; describe
        multi-part features via the CompoundLocation object instead.

        Raises TypeError if location is neither None, a FeatureLocation nor a
        CompoundLocation, and ValueError if strand, ref, ref_db or
        location_operator is given without a suitable location to hold it.
        """
        if location is not None and not isinstance(
                location, (FeatureLocation, CompoundLocation)):
            raise TypeError(
                "FeatureLocation, CompoundLocation (or None) required for the location")
        self.location = location
        self.type = type
        if location_operator:
            # TODO - Deprecation warning
            self.location_operator = location_operator
        if strand is not None:
            # TODO - Deprecation warning
            self.strand = strand
        self.id = id
        if qualifiers is None:
            # Fresh dict per instance (never share a mutable default).
            qualifiers = {}
        self.qualifiers = qualifiers
        if sub_features is not None:
            # The replacement class defined in this module is CompoundLocation.
            raise TypeError("Rather than sub_features, use a CompoundLocation")
        if ref is not None:
            # TODO - Deprecation warning
            self.ref = ref
        if ref_db is not None:
            # TODO - Deprecation warning
            self.ref_db = ref_db

    def _get_strand(self):
        """Return the strand of the feature's location (PRIVATE)."""
        return self.location.strand

    def _set_strand(self, value):
        """Set the strand on the feature's location (PRIVATE).

        Setting None when there is no location is silently accepted; any
        other value without a location raises ValueError.
        """
        try:
            self.location.strand = value
        except AttributeError:
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set strand without a location.")
            else:
                raise

    strand = property(fget=_get_strand, fset=_set_strand,
                      doc="""Feature's strand

                          This is a shortcut for feature.location.strand
                          """)

    def _get_ref(self):
        """Return the location's ref, or None if unavailable (PRIVATE)."""
        try:
            return self.location.ref
        except AttributeError:
            return None

    def _set_ref(self, value):
        """Set the ref on the feature's location (PRIVATE).

        Setting None when there is no location is silently accepted; any
        other value without a location raises ValueError.
        """
        try:
            self.location.ref = value
        except AttributeError:
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set ref without a location.")
            else:
                raise

    ref = property(fget=_get_ref, fset=_set_ref,
                   doc="""Feature location reference (e.g. accession).

                       This is a shortcut for feature.location.ref
                       """)

    def _get_ref_db(self):
        """Return the location's ref_db, or None if unavailable (PRIVATE)."""
        try:
            return self.location.ref_db
        except AttributeError:
            return None

    def _set_ref_db(self, value):
        """Set the ref_db on the feature's location (PRIVATE).

        Mirrors the ref and strand setters: setting None when there is no
        location is silently accepted; any other value without a location
        raises ValueError rather than leaking a bare AttributeError.
        """
        try:
            self.location.ref_db = value
        except AttributeError:
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set ref_db without a location.")
            else:
                raise

    ref_db = property(fget=_get_ref_db, fset=_set_ref_db,
                      doc="""Feature location reference's database.

                          This is a shortcut for feature.location.ref_db
                          """)

    def _get_location_operator(self):
        """Return the location's operator, or None if it has none (PRIVATE)."""
        try:
            return self.location.operator
        except AttributeError:
            return None

    def _set_location_operator(self, value):
        """Set the operator on a compound location (PRIVATE).

        Falsy values are ignored. A true value raises ValueError unless the
        location is a CompoundLocation.
        """
        if value:
            if isinstance(self.location, CompoundLocation):
                self.location.operator = value
            elif self.location is None:
                raise ValueError(
                    "Location is None so can't set its operator (to %r)" % value)
            else:
                raise ValueError(
                    "Only CompoundLocation gets an operator (%r)" % value)

    location_operator = property(fget=_get_location_operator, fset=_set_location_operator,
                                 doc="Location operator for compound locations (e.g. join).")

    def __repr__(self):
        """A string representation of the record for debugging."""
        answer = "%s(%s" % (self.__class__.__name__, repr(self.location))
        if self.type:
            answer += ", type=%s" % repr(self.type)
        if self.location_operator:
            answer += ", location_operator=%s" % repr(self.location_operator)
        if self.id and self.id != "<unknown id>":
            answer += ", id=%s" % repr(self.id)
        if self.ref:
            answer += ", ref=%s" % repr(self.ref)
        if self.ref_db:
            answer += ", ref_db=%s" % repr(self.ref_db)
        answer += ")"
        return answer

    def __str__(self):
        """A readable summary of the feature intended to be printed to screen.
        """
        out = "type: %s\n" % self.type
        out += "location: %s\n" % self.location
        if self.id and self.id != "<unknown id>":
            out += "id: %s\n" % self.id
        out += "qualifiers:\n"
        # Sorted so the output is stable regardless of dictionary ordering.
        for qual_key in sorted(self.qualifiers):
            out += "    Key: %s, Value: %s\n" % (qual_key,
                                               self.qualifiers[qual_key])
        return out

    def _shift(self, offset):
        """Returns a copy of the feature with its location shifted (PRIVATE).

        The annotation qualifiers are copied."""
        return SeqFeature(location=self.location._shift(offset),
                          type=self.type,
                          location_operator=self.location_operator,
                          id=self.id,
                          qualifiers=dict(self.qualifiers.items()))

    def _flip(self, length):
        """Returns a copy of the feature with its location flipped (PRIVATE).

        The argument length gives the length of the parent sequence. For
        example a location 0..20 (+1 strand) with parent length 30 becomes
        after flipping 10..30 (-1 strand). Strandless (None) or unknown
        strand (0) remain like that - just their end points are changed.

        The annotation qualifiers are copied.
        """
        return SeqFeature(location=self.location._flip(length),
                          type=self.type,
                          location_operator=self.location_operator,
                          id=self.id,
                          qualifiers=dict(self.qualifiers.items()))

    def extract(self, parent_sequence):
        """Extract feature sequence from the supplied parent sequence.

        The parent_sequence can be a Seq like object or a string, and will
        generally return an object of the same type. The exception to this is
        a MutableSeq as the parent sequence will return a Seq object.

        This should cope with complex locations including complements, joins
        and fuzzy positions. Even mixed strand features should work! This
        also covers features on protein sequences (e.g. domains), although
        here reverse strand features are not permitted.

        >>> from Bio.Seq import Seq
        >>> from Bio.Alphabet import generic_protein
        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
        >>> f = SeqFeature(FeatureLocation(8, 15), type="domain")
        >>> f.extract(seq)
        Seq('VALIVIC', ProteinAlphabet())

        If the FeatureLocation is None, e.g. when parsing invalid locus
        locations in the GenBank parser, extract() will raise a ValueError.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SeqFeature
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
        >>> f = SeqFeature(None, type="domain")
        >>> f.extract(seq)
        Traceback (most recent call last):
           ...
        ValueError: The feature's .location is None. Check the sequence file for a valid location.

        Note - currently only compound features of type "join" are supported.
        """
        if self.location is None:
            raise ValueError("The feature's .location is None. Check the "
                             "sequence file for a valid location.")
        return self.location.extract(parent_sequence)

    # Python 3:
    def __bool__(self):
        """Boolean value of an instance of this class (True).

        This behaviour is for backwards compatibility, since until the
        __len__ method was added, a SeqFeature always evaluated as True.

        Note that in comparison, Seq objects, strings, lists, etc, will all
        evaluate to False if they have length zero.

        WARNING: The SeqFeature may in future evaluate to False when its
        length is zero (in order to better match normal python behaviour)!
        """
        return True

    # Python 2:
    __nonzero__ = __bool__

    def __len__(self):
        """Returns the length of the region described by a feature.

        >>> from Bio.Seq import Seq
        >>> from Bio.Alphabet import generic_protein
        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
        >>> f = SeqFeature(FeatureLocation(8, 15), type="domain")
        >>> len(f)
        7
        >>> f.extract(seq)
        Seq('VALIVIC', ProteinAlphabet())
        >>> len(f.extract(seq))
        7

        This is a proxy for taking the length of the feature's location:

        >>> len(f.location)
        7

        For simple features this is the same as the region spanned (end
        position minus start position using Pythonic counting). However, for
        a compound location (e.g. a CDS as the join of several exons) the
        gaps are not counted (e.g. introns). This ensures that len(f) matches
        len(f.extract(parent_seq)), and also makes sure things work properly
        with features wrapping the origin etc.
        """
        return len(self.location)

    def __iter__(self):
        """Iterate over the parent positions within the feature.

        The iteration order is strand aware, and can be thought of as moving
        along the feature using the parent sequence coordinates:

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> f = SeqFeature(FeatureLocation(5, 10), type="domain", strand=-1)
        >>> len(f)
        5
        >>> for i in f: print(i)
        9
        8
        7
        6
        5
        >>> list(f)
        [9, 8, 7, 6, 5]

        This is a proxy for iterating over the location,

        >>> list(f.location)
        [9, 8, 7, 6, 5]
        """
        return iter(self.location)

    def __contains__(self, value):
        """Check if an integer position is within the feature.

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> f = SeqFeature(FeatureLocation(5, 10), type="domain", strand=-1)
        >>> len(f)
        5
        >>> [i for i in range(15) if i in f]
        [5, 6, 7, 8, 9]

        For example, to see which features include a SNP position, you could
        use this:

        >>> from Bio import SeqIO
        >>> record = SeqIO.read("GenBank/NC_000932.gb", "gb")
        >>> for f in record.features:
        ...     if 1750 in f:
        ...         print("%s %s" % (f.type, f.location))
        source [0:154478](+)
        gene [1716:4347](-)
        tRNA join{[4310:4347](-), [1716:1751](-)}

        Note that for a feature defined as a join of several subfeatures (e.g.
        the union of several exons) the gaps are not checked (e.g. introns).
        In this example, the tRNA location is defined in the GenBank file as
        complement(join(1717..1751,4311..4347)), so that position 1760 falls
        in the gap:

        >>> for f in record.features:
        ...     if 1760 in f:
        ...         print("%s %s" % (f.type, f.location))
        source [0:154478](+)
        gene [1716:4347](-)

        Note that additional care may be required with fuzzy locations, for
        example just before a BeforePosition:

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> from Bio.SeqFeature import BeforePosition
        >>> f = SeqFeature(FeatureLocation(BeforePosition(3), 8), type="domain")
        >>> len(f)
        5
        >>> [i for i in range(10) if i in f]
        [3, 4, 5, 6, 7]

        Note that this is a proxy for testing membership on the location.

        >>> [i for i in range(10) if i in f.location]
        [3, 4, 5, 6, 7]
        """
        return value in self.location
455
# --- References


# TODO -- Will this hold PubMed and Medline information decently?
class Reference(object):
    """Represent a Generic Reference object.

    Attributes:
        - location - A list of Location objects specifying regions of
          the sequence that the references correspond to. If no locations are
          specified, the entire sequence is assumed.
        - authors - A big old string, or a list split by author, of authors
          for the reference.
        - consrtm - A string, empty by default; only included in the str()
          output when non-empty.
        - title - The title of the reference.
        - journal - Journal the reference was published in.
        - medline_id - A medline reference for the article.
        - pubmed_id - A pubmed reference for the article.
        - comment - A place to stick any comments about the reference.
    """

    def __init__(self):
        """Create an empty reference; fill in the attributes afterwards."""
        self.location = []
        self.authors = ''
        self.consrtm = ''
        self.title = ''
        self.journal = ''
        self.medline_id = ''
        self.pubmed_id = ''
        self.comment = ''

    def __str__(self):
        """Output an informative string for debugging.
        """
        lines = ["location: %s\n" % single_location
                 for single_location in self.location]
        lines.append("authors: %s\n" % self.authors)
        if self.consrtm:
            lines.append("consrtm: %s\n" % self.consrtm)
        lines.append("title: %s\n" % self.title)
        lines.append("journal: %s\n" % self.journal)
        lines.append("medline id: %s\n" % self.medline_id)
        lines.append("pubmed id: %s\n" % self.pubmed_id)
        lines.append("comment: %s\n" % self.comment)
        return "".join(lines)

    def __repr__(self):
        """Return a short debugging representation showing only the title."""
        # TODO - Update this if __init__ later accepts values
        return "%s(title=%s, ...)" % (self.__class__.__name__,
                                      repr(self.title))

    def __eq__(self, other):
        """Check if two Reference objects should be considered equal

        Note that the location is not compared, as __eq__ for the
        FeatureLocation class is not defined.

        Returns NotImplemented when other is not a Reference, so that
        comparing against unrelated objects (e.g. None) gives False rather
        than raising AttributeError.
        """
        if not isinstance(other, Reference):
            return NotImplemented
        return self.authors == other.authors and \
            self.consrtm == other.consrtm and \
            self.title == other.title and \
            self.journal == other.journal and \
            self.medline_id == other.medline_id and \
            self.pubmed_id == other.pubmed_id and \
            self.comment == other.comment

    def __ne__(self, other):
        """Inverse of __eq__ (Python 2 does not derive != from == itself)."""
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result
521
# --- Handling feature locations

class FeatureLocation(object):
    """Specify the location of a feature along a sequence.

    The FeatureLocation is used for simple continuous features, which can
    be described as running from a start position to an end position
    (optionally with a strand and reference information). More complex
    locations made up from several non-continuous parts (e.g. a coding
    sequence made up of several exons) are described using a SeqFeature
    with a CompoundLocation.

    Note that the start and end location numbering follow Python's scheme,
    thus a GenBank entry of 123..150 (one based counting) becomes a location
    of [122:150] (zero based counting).

    >>> from Bio.SeqFeature import FeatureLocation
    >>> f = FeatureLocation(122, 150)
    >>> print(f)
    [122:150]
    >>> print(f.start)
    122
    >>> print(f.end)
    150
    >>> print(f.strand)
    None

    Note the strand defaults to None. If you are working with nucleotide
    sequences you'd want to be explicit if it is the forward strand:

    >>> from Bio.SeqFeature import FeatureLocation
    >>> f = FeatureLocation(122, 150, strand=+1)
    >>> print(f)
    [122:150](+)
    >>> print(f.strand)
    1

    Note that for a parent sequence of length n, the FeatureLocation
    start and end must satisfy the inequality 0 <= start <= end <= n.
    This means even for features on the reverse strand of a nucleotide
    sequence, we expect the 'start' coordinate to be less than the
    'end'.

    >>> from Bio.SeqFeature import FeatureLocation
    >>> r = FeatureLocation(122, 150, strand=-1)
    >>> print(r)
    [122:150](-)
    >>> print(r.start)
    122
    >>> print(r.end)
    150
    >>> print(r.strand)
    -1

    i.e. Rather than thinking of the 'start' and 'end' biologically in a
    strand aware manner, think of them as the 'left most' or 'minimum'
    boundary, and the 'right most' or 'maximum' boundary of the region
    being described. This is particularly important with compound
    locations describing non-continuous regions.

    In the example above we have used standard exact positions, but there
    are also specialised position objects used to represent fuzzy positions
    as well, for example a GenBank location like complement(<123..150)
    would use a BeforePosition object for the start.
    """

    def __init__(self, start, end, strand=None, ref=None, ref_db=None):
        """Specify the start, end, strand etc of a sequence feature.

        start and end arguments specify the values where the feature begins
        and ends. These can either be any of the ``*Position`` objects that
        inherit from AbstractPosition, or can just be integers specifying the
        position. In the case of integers, the values are assumed to be
        exact and are converted into ExactPosition arguments. This is meant
        to make it easy to deal with non-fuzzy ends.

        i.e. Short form:

        >>> from Bio.SeqFeature import FeatureLocation
        >>> loc = FeatureLocation(5, 10, strand=-1)
        >>> print(loc)
        [5:10](-)

        Explicit form:

        >>> from Bio.SeqFeature import FeatureLocation, ExactPosition
        >>> loc = FeatureLocation(ExactPosition(5), ExactPosition(10), strand=-1)
        >>> print(loc)
        [5:10](-)

        Other fuzzy positions are used similarly,

        >>> from Bio.SeqFeature import FeatureLocation
        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
        >>> loc2 = FeatureLocation(BeforePosition(5), AfterPosition(10), strand=-1)
        >>> print(loc2)
        [<5:>10](-)

        For nucleotide features you will also want to specify the strand,
        use 1 for the forward (plus) strand, -1 for the reverse (negative)
        strand, 0 for stranded but strand unknown (? in GFF3), or None for
        when the strand does not apply (dot in GFF3), e.g. features on
        proteins.

        >>> loc = FeatureLocation(5, 10, strand=+1)
        >>> print(loc)
        [5:10](+)
        >>> print(loc.strand)
        1

        Normally feature locations are given relative to the parent
        sequence you are working with, but an explicit accession can
        be given with the optional ref and ref_db strings:

        >>> loc = FeatureLocation(105172, 108462, ref="AL391218.9", strand=1)
        >>> print(loc)
        AL391218.9[105172:108462](+)
        >>> print(loc.ref)
        AL391218.9

        Raises TypeError if start or end is neither a position object nor an
        integer, and ValueError if the strand is not one of +1, -1, 0 or None.
        """
        # TODO - Check 0 <= start <= end (<= length of reference)
        if isinstance(start, AbstractPosition):
            self._start = start
        elif _is_int_or_long(start):
            self._start = ExactPosition(start)
        else:
            raise TypeError("start=%r %s" % (start, type(start)))
        if isinstance(end, AbstractPosition):
            self._end = end
        elif _is_int_or_long(end):
            self._end = ExactPosition(end)
        else:
            raise TypeError("end=%r %s" % (end, type(end)))
        # Goes through the property setter below, which validates the value.
        self.strand = strand
        self.ref = ref
        self.ref_db = ref_db

    def _get_strand(self):
        """Return the stored strand value (PRIVATE)."""
        return self._strand

    def _set_strand(self, value):
        """Validate and store the strand value (PRIVATE)."""
        if value not in [+1, -1, 0, None]:
            raise ValueError("Strand should be +1, -1, 0 or None, not %r"
                             % value)
        self._strand = value

    strand = property(fget=_get_strand, fset=_set_strand,
                      doc="Strand of the location (+1, -1, 0 or None).")

    def __str__(self):
        """Returns a representation of the location (with python counting).

        For the simple case this uses the python splicing syntax, [122:150]
        (zero based counting) which GenBank would call 123..150 (one based
        counting).
        """
        answer = "[%s:%s]" % (self._start, self._end)
        if self.ref and self.ref_db:
            answer = "%s:%s%s" % (self.ref_db, self.ref, answer)
        elif self.ref:
            answer = self.ref + answer
        # Is ref_db without ref meaningful?
        if self.strand is None:
            return answer
        elif self.strand == +1:
            return answer + "(+)"
        elif self.strand == -1:
            return answer + "(-)"
        else:
            # strand = 0, stranded but strand unknown, ? in GFF3
            return answer + "(?)"

    def __repr__(self):
        """A string representation of the location for debugging."""
        optional = ""
        if self.strand is not None:
            optional += ", strand=%r" % self.strand
        if self.ref is not None:
            optional += ", ref=%r" % self.ref
        if self.ref_db is not None:
            optional += ", ref_db=%r" % self.ref_db
        return "%s(%r, %r%s)" \
               % (self.__class__.__name__, self.start, self.end, optional)

    def __add__(self, other):
        """Combine location with another feature location, or shift it.

        You can add two feature locations to make a join CompoundLocation:

        >>> from Bio.SeqFeature import FeatureLocation
        >>> f1 = FeatureLocation(5, 10)
        >>> f2 = FeatureLocation(20, 30)
        >>> combined = f1 + f2
        >>> print(combined)
        join{[5:10], [20:30]}

        This is thus equivalent to:

        >>> from Bio.SeqFeature import CompoundLocation
        >>> join = CompoundLocation([f1, f2])
        >>> print(join)
        join{[5:10], [20:30]}

        You can also use sum(...) in this way:

        >>> join = sum([f1, f2])
        >>> print(join)
        join{[5:10], [20:30]}

        Furthermore, you can combine a FeatureLocation with a CompoundLocation
        in this way.

        Separately, adding an integer will give a new FeatureLocation with
        its start and end offset by that amount. For example:

        >>> print(f1)
        [5:10]
        >>> print(f1 + 100)
        [105:110]
        >>> print(200 + f1)
        [205:210]

        This can be useful when editing annotation.
        """
        if isinstance(other, FeatureLocation):
            return CompoundLocation([self, other])
        elif isinstance(other, int):
            return self._shift(other)
        else:
            # This will allow CompoundLocation's __radd__ to be called:
            return NotImplemented

    def __radd__(self, other):
        """Support integer + location, giving a shifted copy of the location."""
        if isinstance(other, int):
            return self._shift(other)
        else:
            return NotImplemented

    # Python 3:
    def __bool__(self):
        """Returns True regardless of the length of the feature.

        This behaviour is for backwards compatibility, since until the
        __len__ method was added, a FeatureLocation always evaluated as True.

        Note that in comparison, Seq objects, strings, lists, etc, will all
        evaluate to False if they have length zero.

        WARNING: The FeatureLocation may in future evaluate to False when its
        length is zero (in order to better match normal python behaviour)!
        """
        return True

    # Python 2 looks up __nonzero__; Python 3 looks up __bool__ and would
    # otherwise fall back to __len__, making a zero-length location falsy.
    __nonzero__ = __bool__

    def __len__(self):
        """Returns the length of the region described by the FeatureLocation.

        Note that extra care may be needed for fuzzy locations, e.g.

        >>> from Bio.SeqFeature import FeatureLocation
        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
        >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10))
        >>> len(loc)
        5
        """
        return int(self._end) - int(self._start)

    def __contains__(self, value):
        """Check if an integer position is within the FeatureLocation.

        Note that extra care may be needed for fuzzy locations, e.g.

        >>> from Bio.SeqFeature import FeatureLocation
        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
        >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10))
        >>> len(loc)
        5
        >>> [i for i in range(15) if i in loc]
        [5, 6, 7, 8, 9]

        Raises ValueError if value is not an integer.
        """
        if not isinstance(value, int):
            raise ValueError("Currently we only support checking for integer "
                             "positions being within a FeatureLocation.")
        # Half-open interval: start is included, end is excluded.
        return not (value < self._start or value >= self._end)

    def __iter__(self):
        """Iterate over the parent positions within the FeatureLocation.

        >>> from Bio.SeqFeature import FeatureLocation
        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
        >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10))
        >>> len(loc)
        5
        >>> for i in loc: print(i)
        5
        6
        7
        8
        9
        >>> list(loc)
        [5, 6, 7, 8, 9]
        >>> [i for i in range(15) if i in loc]
        [5, 6, 7, 8, 9]

        Note this is strand aware:

        >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10), strand = -1)
        >>> list(loc)
        [9, 8, 7, 6, 5]
        """
        if self.strand == -1:
            # Reverse strand: walk from the last position down to the start.
            for i in range(self._end - 1, self._start - 1, -1):
                yield i
        else:
            for i in range(self._start, self._end):
                yield i

    def _shift(self, offset):
        """Returns a copy of the location shifted by the offset (PRIVATE)."""
        # TODO - What if offset is a fuzzy position?
        if self.ref or self.ref_db:
            # TODO - Return self?
            raise ValueError("Feature references another sequence.")
        return FeatureLocation(start=self._start._shift(offset),
                               end=self._end._shift(offset),
                               strand=self.strand)

    def _flip(self, length):
        """Returns a copy of the location after the parent is reversed (PRIVATE)."""
        if self.ref or self.ref_db:
            # TODO - Return self?
            raise ValueError("Feature references another sequence.")
        # Note this will flip the start and end too!
        if self.strand == +1:
            flip_strand = -1
        elif self.strand == -1:
            flip_strand = +1
        else:
            # 0 or None
            flip_strand = self.strand
        return FeatureLocation(start=self._end._flip(length),
                               end=self._start._flip(length),
                               strand=flip_strand)

    @property
    def parts(self):
        """Read only list of parts (always one, the Feature Location).

        This is a convenience property allowing you to write code handling
        both simple FeatureLocation objects (with one part) and more complex
        CompoundLocation objects (with multiple parts) interchangeably.
        """
        return [self]

    @property
    def start(self):
        """Start location - left most (minimum) value, regardless of strand.

        Read only, returns an integer like position object, possibly a fuzzy
        position.
        """
        return self._start

    @property
    def end(self):
        """End location - right most (maximum) value, regardless of strand.

        Read only, returns an integer like position object, possibly a fuzzy
        position.
        """
        return self._end

    @property
    def nofuzzy_start(self):
        """Start position (integer, approximated if fuzzy, read only) (OBSOLETE).

        This is now an alias for int(feature.start), which should be
        used in preference -- unless you are trying to support old
        versions of Biopython.

        Returns None if the start is an UnknownPosition.
        """
        try:
            return int(self._start)
        except TypeError:
            if isinstance(self._start, UnknownPosition):
                return None
            raise

    @property
    def nofuzzy_end(self):
        """End position (integer, approximated if fuzzy, read only) (OBSOLETE).

        This is now an alias for int(feature.end), which should be
        used in preference -- unless you are trying to support old
        versions of Biopython.

        Returns None if the end is an UnknownPosition.
        """
        try:
            return int(self._end)
        except TypeError:
            if isinstance(self._end, UnknownPosition):
                return None
            raise

    def extract(self, parent_sequence):
        """Extract feature sequence from the supplied parent sequence.

        Slices parent_sequence between the (non-fuzzy) start and end, and
        reverse complements the result when the strand is -1. Raises
        ValueError if this location refers to another sequence via ref or
        ref_db.
        """
        if self.ref or self.ref_db:
            # TODO - Take a dictionary as an optional argument?
            raise ValueError("Feature references another sequence.")
        if isinstance(parent_sequence, MutableSeq):
            # This avoids complications with reverse complements
            # (the MutableSeq reverse complement acts in situ)
            parent_sequence = parent_sequence.toseq()
        f_seq = parent_sequence[self.nofuzzy_start:self.nofuzzy_end]
        if self.strand == -1:
            try:
                f_seq = f_seq.reverse_complement()
            except AttributeError:
                # Plain strings have no method; use the module level function.
                assert isinstance(f_seq, str)
                f_seq = reverse_complement(f_seq)
        return f_seq
944
class CompoundLocation(object):
    """For handling joins etc where a feature location has several parts."""

    def __init__(self, parts, operator="join"):
        """Create a compound location with several parts.

        Arguments:
         - parts - iterable of FeatureLocation objects (at least two).
         - operator - string naming how the parts are combined
           (default "join").

        Raises ValueError if any part is not a FeatureLocation, or if
        fewer than two parts are supplied.

        >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation
        >>> f1 = FeatureLocation(10, 40, strand=+1)
        >>> f2 = FeatureLocation(50, 59, strand=+1)
        >>> f = CompoundLocation([f1, f2])
        >>> len(f) == len(f1) + len(f2) == 39 == len(list(f))
        True
        >>> print(f.operator)
        join
        >>> 5 in f
        False
        >>> 15 in f
        True
        >>> f.strand
        1

        Notice that the strand of the compound location is computed
        automatically - in the case of mixed strands on the sub-locations
        the overall strand is set to None.

        >>> f = CompoundLocation([FeatureLocation(3, 6, strand=+1),
        ...                       FeatureLocation(10, 13, strand=-1)])
        >>> print(f.strand)
        None
        >>> len(f)
        6
        >>> list(f)
        [3, 4, 5, 12, 11, 10]

        The example above doing list(f) iterates over the coordinates within the
        feature. This allows you to use max and min on the location, to find the
        range covered:

        >>> min(f)
        3
        >>> max(f)
        12

        More generally, you can use the compound location's start and end which
        give the full range covered, 0 <= start <= end <= full sequence length.

        >>> f.start == min(f)
        True
        >>> f.end == max(f) + 1
        True

        This is consistent with the behaviour of the simple FeatureLocation for
        a single region, where again the 'start' and 'end' do not necessarily
        give the biological start and end, but rather the 'minimal' and 'maximal'
        coordinate boundaries.

        Note that adding locations provides a more intuitive method of
        construction:

        >>> f = FeatureLocation(3, 6, strand=+1) + FeatureLocation(10, 13, strand=-1)
        >>> len(f)
        6
        >>> list(f)
        [3, 4, 5, 12, 11, 10]
        """
        self.operator = operator
        self.parts = list(parts)
        for loc in self.parts:
            if not isinstance(loc, FeatureLocation):
                raise ValueError("CompoundLocation should be given a list of "
                                 "FeatureLocation objects, not %s" % loc.__class__)
        # Check (and report) the materialised list rather than the raw
        # argument: an iterator has no len(), and a tuple given directly
        # to the % operator would be unpacked as the format arguments.
        if len(self.parts) < 2:
            raise ValueError(
                "CompoundLocation should have at least 2 parts, not %r"
                % (self.parts,))

    def __str__(self):
        """Returns a representation of the location (with python counting)."""
        return "%s{%s}" % (self.operator, ", ".join(str(loc) for loc in self.parts))

    def __repr__(self):
        """String representation of the location for debugging."""
        return "%s(%r, %r)" % (self.__class__.__name__,
                               self.parts, self.operator)

    def _get_strand(self):
        # Historically a join on the reverse strand has been represented
        # in Biopython with both the parent SeqFeature and its children
        # (the exons for a CDS) all given a strand of -1.  Likewise, for
        # a join feature on the forward strand they all have strand +1.
        # However, we must also consider evil mixed strand examples like
        # this, join(complement(69611..69724),139856..140087,140625..140650)
        if len(set(loc.strand for loc in self.parts)) == 1:
            return self.parts[0].strand
        else:
            return None  # i.e. mixed strands

    def _set_strand(self, value):
        # Should this be allowed/encouraged?
        for loc in self.parts:
            loc.strand = value

    strand = property(fget=_get_strand, fset=_set_strand,
                      doc="""Overall strand of the compound location.

        If all the parts have the same strand, that is returned. Otherwise
        for mixed strands, this returns None.

        >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation
        >>> f1 = FeatureLocation(15, 17, strand=1)
        >>> f2 = FeatureLocation(20, 30, strand=-1)
        >>> f = f1 + f2
        >>> f1.strand
        1
        >>> f2.strand
        -1
        >>> f.strand
        >>> f.strand is None
        True

        If you set the strand of a CompoundLocation, this is applied to
        all the parts - use with caution:

        >>> f.strand = 1
        >>> f1.strand
        1
        >>> f2.strand
        1
        >>> f.strand
        1

        """)

    def __add__(self, other):
        """Combine locations, or shift the location by an integer offset.

        >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation
        >>> f1 = FeatureLocation(15, 17) + FeatureLocation(20, 30)
        >>> print(f1)
        join{[15:17], [20:30]}

        You can add another FeatureLocation:

        >>> print(f1 + FeatureLocation(40, 50))
        join{[15:17], [20:30], [40:50]}
        >>> print(FeatureLocation(5, 10) + f1)
        join{[5:10], [15:17], [20:30]}

        You can also add another CompoundLocation:

        >>> f2 = FeatureLocation(40, 50) + FeatureLocation(60, 70)
        >>> print(f2)
        join{[40:50], [60:70]}
        >>> print(f1 + f2)
        join{[15:17], [20:30], [40:50], [60:70]}

        Also, as with the FeatureLocation, adding an integer shifts the
        location's co-ordinates by that offset:

        >>> print(f1 + 100)
        join{[115:117], [120:130]}
        >>> print(200 + f1)
        join{[215:217], [220:230]}
        >>> print(f1 + (-5))
        join{[10:12], [15:25]}

        Raises ValueError when two compound locations with different
        operators are added, and NotImplementedError for any other type.
        """
        if isinstance(other, FeatureLocation):
            return CompoundLocation(self.parts + [other], self.operator)
        elif isinstance(other, CompoundLocation):
            if self.operator != other.operator:
                # Handle join+order -> order as a special case?
                raise ValueError("Mixed operators %s and %s"
                                 % (self.operator, other.operator))
            return CompoundLocation(self.parts + other.parts, self.operator)
        elif isinstance(other, int):
            return self._shift(other)
        else:
            raise NotImplementedError

    def __radd__(self, other):
        """Combine locations (other + self), or shift by an integer offset."""
        if isinstance(other, FeatureLocation):
            return CompoundLocation([other] + self.parts, self.operator)
        elif isinstance(other, int):
            return self._shift(other)
        else:
            raise NotImplementedError

    def __contains__(self, value):
        """Check if an integer position is within the location."""
        return any(value in loc for loc in self.parts)

    def __nonzero__(self):
        """Returns True regardless of the length of the feature.

        This behaviour is for backwards compatibility, since until the
        __len__ method was added, a FeatureLocation always evaluated as True.

        Note that in comparison, Seq objects, strings, lists, etc, will all
        evaluate to False if they have length zero.

        WARNING: The FeatureLocation may in future evaluate to False when its
        length is zero (in order to better match normal python behaviour)!
        """
        return True

    # Python 3 looks for __bool__ rather than __nonzero__; without this
    # alias truth testing would silently fall back on __len__.
    __bool__ = __nonzero__

    def __len__(self):
        """Return the total length, the sum of the lengths of the parts."""
        return sum(len(loc) for loc in self.parts)

    def __iter__(self):
        """Iterate over the positions of each part in turn, in part order."""
        for loc in self.parts:
            for pos in loc:
                yield pos

    def _shift(self, offset):
        """Returns a copy of the location shifted by the offset (PRIVATE)."""
        return CompoundLocation([loc._shift(offset) for loc in self.parts],
                                self.operator)

    def _flip(self, length):
        """Returns a copy of the location after the parent is reversed (PRIVATE).

        Note that the order of the parts is NOT reversed too. Consider a CDS
        on the forward strand with exons small, medium and large (in length).
        Once we change the frame of reference to the reverse complement strand,
        the start codon is still part of the small exon, and the stop codon
        still part of the large exon - so the part order remains the same!

        Here is an artificial example, where the features map to the two upper
        case regions and the lower case runs of n are not used:

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import FeatureLocation
        >>> dna = Seq("nnnnnAGCATCCTGCTGTACnnnnnnnnGAGAMTGCCATGCCCCTGGAGTGAnnnnn")
        >>> small = FeatureLocation(5, 20, strand=1)
        >>> large = FeatureLocation(28, 52, strand=1)
        >>> location = small + large
        >>> print(small)
        [5:20](+)
        >>> print(large)
        [28:52](+)
        >>> print(location)
        join{[5:20](+), [28:52](+)}
        >>> for part in location.parts:
        ...     print(len(part))
        ...
        15
        24

        As you can see, this is a silly example where each "exon" is a word:

        >>> print(small.extract(dna).translate())
        SILLY
        >>> print(large.extract(dna).translate())
        EXAMPLE*
        >>> print(location.extract(dna).translate())
        SILLYEXAMPLE*
        >>> for part in location.parts:
        ...     print(part.extract(dna).translate())
        ...
        SILLY
        EXAMPLE*

        Now, let's look at this from the reverse strand frame of reference:

        >>> flipped_dna = dna.reverse_complement()
        >>> flipped_location = location._flip(len(dna))
        >>> print(flipped_location.extract(flipped_dna).translate())
        SILLYEXAMPLE*
        >>> for part in flipped_location.parts:
        ...     print(part.extract(flipped_dna).translate())
        ...
        SILLY
        EXAMPLE*

        The key point here is the first part of the CompoundFeature is still the
        small exon, while the second part is still the large exon:

        >>> for part in flipped_location.parts:
        ...     print(len(part))
        ...
        15
        24
        >>> print(flipped_location)
        join{[37:52](-), [5:29](-)}

        Notice the parts are not reversed. However, there was a bug here in older
        versions of Biopython which would have given join{[5:29](-), [37:52](-)}
        and the translation would have wrongly been "EXAMPLE*SILLY" instead.

        """
        return CompoundLocation([loc._flip(length) for loc in self.parts],
                                self.operator)

    @property
    def start(self):
        """Start location - left most (minimum) value, regardless of strand.

        Read only, returns an integer like position object, possibly a fuzzy
        position.

        For the special case of a CompoundLocation wrapping the origin of a
        circular genome, this will return zero.
        """
        return min(loc.start for loc in self.parts)

    @property
    def end(self):
        """End location - right most (maximum) value, regardless of strand.

        Read only, returns an integer like position object, possibly a fuzzy
        position.

        For the special case of a CompoundLocation wrapping the origin of
        a circular genome this will match the genome length (minus one
        given how Python counts from zero).
        """
        return max(loc.end for loc in self.parts)

    @property
    def nofuzzy_start(self):
        """Start position (integer, approximated if fuzzy, read only) (OBSOLETE).

        This is an alias for int(feature.start), which should be used in
        preference -- unless you are trying to support old versions of
        Biopython.
        """
        try:
            return int(self.start)
        except TypeError:
            if isinstance(self.start, UnknownPosition):
                return None
            raise

    @property
    def nofuzzy_end(self):
        """End position (integer, approximated if fuzzy, read only) (OBSOLETE).

        This is an alias for int(feature.end), which should be used in
        preference -- unless you are trying to support old versions of
        Biopython.
        """
        try:
            return int(self.end)
        except TypeError:
            if isinstance(self.end, UnknownPosition):
                return None
            raise

    @property
    def ref(self):
        """CompoundLocation's don't have a ref (dummy method for API compatibility)."""
        return None

    @property
    def ref_db(self):
        """CompoundLocation's don't have a ref_db (dummy method for API compatibility)."""
        return None

    def extract(self, parent_sequence):
        """Extract feature sequence from the supplied parent sequence.

        Each part is extracted in turn and the pieces are added together
        in part order.
        """
        # This copes with mixed strand features & all on reverse:
        parts = [loc.extract(parent_sequence) for loc in self.parts]
        # We use addition rather than a join to avoid alphabet issues:
        f_seq = parts[0]
        for part in parts[1:]:
            f_seq += part
        return f_seq
1315
class AbstractPosition(object):
    """Abstract base class representing a position."""

    def __repr__(self):
        """Return a debugging string: the class name followed by '(...)'."""
        class_name = self.__class__.__name__
        return "%s(...)" % class_name
1323
class ExactPosition(int, AbstractPosition):
    """Specify the specific position of a boundary.

    Arguments:
     - position - The position of the boundary.
     - extension - An optional argument which must be zero since we don't
       have an extension. The argument is provided so that the same number
       of arguments can be passed to all position types.

    In this case, there is no fuzziness associated with the position.

    >>> p = ExactPosition(5)
    >>> p
    ExactPosition(5)
    >>> print(p)
    5

    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    Integer comparisons and operations should work as expected:

    >>> p == 5
    True
    >>> p < 6
    True
    >>> p <= 5
    True
    >>> p + 10
    15

    """

    # Subclasses int so the value must be set in __new__ (not __init__)
    def __new__(cls, position, extension=0):
        """Create the position, refusing any non-zero extension."""
        if extension != 0:
            message = "Non-zero extension %s for exact position." % extension
            raise AttributeError(message)
        return int.__new__(cls, position)

    def __repr__(self):
        """String representation of the ExactPosition location for debugging."""
        class_name = self.__class__.__name__
        return "%s(%i)" % (class_name, int(self))

    @property
    def position(self):
        """Legacy attribute to get position as integer (OBSOLETE)."""
        return int(self)

    @property
    def extension(self):
        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
        return 0

    def _shift(self, offset):
        """Return a copy moved along by the offset (PRIVATE)."""
        # By default preserve any subclass
        shifted = int(self) + offset
        return self.__class__(shifted)

    def _flip(self, length):
        """Return a copy for a parent of this length once reversed (PRIVATE)."""
        # By default preserve any subclass
        mirrored = length - int(self)
        return self.__class__(mirrored)
1385
class UncertainPosition(ExactPosition):
    """Specify a specific position which is uncertain.

    This is used in UniProt, e.g. ?222 for uncertain position 222, or in the
    XML format explicitly marked as uncertain. Does not apply to GenBank/EMBL.

    Everything is inherited from ExactPosition; only the class name differs.
    """
1394
class UnknownPosition(AbstractPosition):
    """Specify a specific position which is unknown (has no position).

    This is used in UniProt, e.g. ? or in the XML as unknown.
    """

    def __repr__(self):
        """String representation of the UnknownPosition location for debugging."""
        class_name = self.__class__.__name__
        return "%s()" % class_name

    def __hash__(self):
        """Return the hash of None, so all unknown positions hash alike."""
        return hash(None)

    @property
    def position(self):
        """Legacy attribute to get position (None) (OBSOLETE)."""
        return None

    @property
    def extension(self):
        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
        return 0

    def _shift(self, offset):
        """Return this same object - there is no coordinate to move (PRIVATE)."""
        return self

    def _flip(self, length):
        """Return this same object - there is no coordinate to mirror (PRIVATE)."""
        return self
1424
class WithinPosition(int, AbstractPosition):
    """Specify the position of a boundary within some coordinates.

    Arguments:
     - position - The default integer position
     - left - The start (left) position of the boundary
     - right - The end (right) position of the boundary

    This allows dealing with a position like ((1.4)..100). This
    indicates that the start of the sequence is somewhere between 1
    and 4. Since this is a start coordinate, it should act like
    it is at position 1 (or in Python counting, 0).

    >>> p = WithinPosition(10, 10, 13)
    >>> p
    WithinPosition(10, left=10, right=13)
    >>> print(p)
    (10.13)
    >>> int(p)
    10

    Basic integer comparisons and operations should work as though
    this were a plain integer:

    >>> p == 10
    True
    >>> p in [9, 10, 11]
    True
    >>> p < 11
    True
    >>> p + 10
    20

    >>> isinstance(p, WithinPosition)
    True
    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    Note this also applies for comparison to other position objects,
    where again the integer behaviour is used:

    >>> p == 10
    True
    >>> p == ExactPosition(10)
    True
    >>> p == BeforePosition(10)
    True
    >>> p == AfterPosition(10)
    True

    If this were an end point, you would want the position to be 13:

    >>> p2 = WithinPosition(13, 10, 13)
    >>> p2
    WithinPosition(13, left=10, right=13)
    >>> print(p2)
    (10.13)
    >>> int(p2)
    13
    >>> p2 == 13
    True
    >>> p2 == ExactPosition(13)
    True

    The old legacy properties of position and extension give the
    starting/lower/left position as an integer, and the distance
    to the ending/higher/right position as an integer. Note that
    the position object will act like either the left or the right
    end-point depending on how it was created:

    >>> p.position == p2.position == 10
    True
    >>> p.extension == p2.extension == 3
    True
    >>> int(p) == int(p2)
    False
    >>> p == 10
    True
    >>> p2 == 13
    True

    """

    # Subclasses int so the value must be set in __new__ (not __init__)
    def __new__(cls, position, left, right):
        """Create the position, which must equal either left or right."""
        assert position == left or position == right, \
            "WithinPosition: %r should match left %r or right %r" \
            % (position, left, right)
        within = int.__new__(cls, position)
        within._left = left
        within._right = right
        return within

    def __repr__(self):
        """String representation of the WithinPosition location for debugging."""
        class_name = self.__class__.__name__
        return "%s(%i, left=%i, right=%i)" \
            % (class_name, int(self), self._left, self._right)

    def __str__(self):
        """Return the range in the form (left.right)."""
        return "(%s.%s)" % (self._left, self._right)

    @property
    def position(self):
        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
        return self._left

    @property
    def extension(self):
        """Legacy attribute to get extension (from left to right) as an integer (OBSOLETE)."""
        return self._right - self._left

    def _shift(self, offset):
        """Return a copy with all three coordinates moved by offset (PRIVATE)."""
        cls = self.__class__
        return cls(int(self) + offset,
                   self._left + offset,
                   self._right + offset)

    def _flip(self, length):
        """Return a copy for a parent of this length once reversed (PRIVATE)."""
        # Mirroring swaps the roles of the left and right boundaries.
        cls = self.__class__
        return cls(length - int(self),
                   length - self._right,
                   length - self._left)
1547
class BetweenPosition(int, AbstractPosition):
    """Specify the position of a boundary between two coordinates (OBSOLETE?).

    Arguments:
     - position - The default integer position
     - left - The start (left) position of the boundary
     - right - The end (right) position of the boundary

    This allows dealing with a position like 123^456. This
    indicates that the start of the sequence is somewhere between
    123 and 456. It is up to the parser to set the position argument
    to either boundary point (depending on if this is being used as
    a start or end of the feature). For example as a feature end:

    >>> p = BetweenPosition(456, 123, 456)
    >>> p
    BetweenPosition(456, left=123, right=456)
    >>> print(p)
    (123^456)
    >>> int(p)
    456

    Integer equality and comparison use the given position,

    >>> p == 456
    True
    >>> p in [455, 456, 457]
    True
    >>> p > 300
    True

    The old legacy properties of position and extension give the
    starting/lower/left position as an integer, and the distance
    to the ending/higher/right position as an integer. Note that
    the position object will act like either the left or the right
    end-point depending on how it was created:

    >>> p2 = BetweenPosition(123, left=123, right=456)
    >>> p.position == p2.position == 123
    True
    >>> p.extension
    333
    >>> p2.extension
    333
    >>> p.extension == p2.extension == 333
    True
    >>> int(p) == int(p2)
    False
    >>> p == 456
    True
    >>> p2 == 123
    True

    Note this potentially surprising behaviour:

    >>> BetweenPosition(123, left=123, right=456) == ExactPosition(123)
    True
    >>> BetweenPosition(123, left=123, right=456) == BeforePosition(123)
    True
    >>> BetweenPosition(123, left=123, right=456) == AfterPosition(123)
    True

    i.e. For equality (and sorting) the position objects behave like
    integers.
    """

    # Subclasses int so the value must be set in __new__ (not __init__)
    def __new__(cls, position, left, right):
        """Create the position, which must equal either left or right."""
        assert position == left or position == right
        between = int.__new__(cls, position)
        between._left = left
        between._right = right
        return between

    def __repr__(self):
        """String representation of the BetweenPosition location for debugging."""
        class_name = self.__class__.__name__
        return "%s(%i, left=%i, right=%i)" \
            % (class_name, int(self), self._left, self._right)

    def __str__(self):
        """Return the range in the form (left^right)."""
        return "(%s^%s)" % (self._left, self._right)

    @property
    def position(self):
        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
        return self._left

    @property
    def extension(self):
        """Legacy attribute to get extension (from left to right) as an integer (OBSOLETE)."""
        return self._right - self._left

    def _shift(self, offset):
        """Return a copy with all three coordinates moved by offset (PRIVATE)."""
        cls = self.__class__
        return cls(int(self) + offset,
                   self._left + offset,
                   self._right + offset)

    def _flip(self, length):
        """Return a copy for a parent of this length once reversed (PRIVATE)."""
        # Mirroring swaps the roles of the left and right boundaries.
        cls = self.__class__
        return cls(length - int(self),
                   length - self._right,
                   length - self._left)
1649
class BeforePosition(int, AbstractPosition):
    """Specify a position where the actual location occurs before it.

    Arguments:
     - position - The upper boundary of where the location can occur.
     - extension - An optional argument which must be zero since we don't
       have an extension. The argument is provided so that the same number
       of arguments can be passed to all position types.

    This is used to specify positions like (<10..100) where the location
    occurs somewhere before position 10.

    >>> p = BeforePosition(5)
    >>> p
    BeforePosition(5)
    >>> print(p)
    <5
    >>> int(p)
    5
    >>> p + 10
    15

    Note this potentially surprising behaviour:

    >>> p == ExactPosition(5)
    True
    >>> p == AfterPosition(5)
    True

    Just remember that for equality and sorting the position objects act
    like integers.
    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, extension=0):
        """Create the position, refusing any non-zero extension."""
        if extension != 0:
            # NOTE(review): the wording says "exact position" although this
            # is a BeforePosition - looks copied from ExactPosition.
            message = "Non-zero extension %s for exact position." % extension
            raise AttributeError(message)
        return int.__new__(cls, position)

    @property
    def position(self):
        """Legacy attribute to get position as integer (OBSOLETE)."""
        return int(self)

    @property
    def extension(self):
        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
        return 0

    def __repr__(self):
        """A string representation of the location for debugging."""
        class_name = self.__class__.__name__
        return "%s(%i)" % (class_name, int(self))

    def __str__(self):
        """Return the position with a leading less-than sign."""
        return "<%s" % self.position

    def _shift(self, offset):
        """Return a copy moved along by the offset (PRIVATE)."""
        shifted = int(self) + offset
        return self.__class__(shifted)

    def _flip(self, length):
        """Return the matching AfterPosition once the parent is reversed (PRIVATE)."""
        mirrored = length - int(self)
        return AfterPosition(mirrored)
1712
class AfterPosition(int, AbstractPosition):
    """Specify a position where the actual location is found after it.

    Arguments:
     - position - The lower boundary of where the location can occur.
     - extension - An optional argument which must be zero since we don't
       have an extension. The argument is provided so that the same number
       of arguments can be passed to all position types.

    This is used to specify positions like (>10..100) where the location
    occurs somewhere after position 10.

    >>> p = AfterPosition(7)
    >>> p
    AfterPosition(7)
    >>> print(p)
    >7
    >>> int(p)
    7
    >>> p + 10
    17

    >>> isinstance(p, AfterPosition)
    True
    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    Note this potentially surprising behaviour:

    >>> p == ExactPosition(7)
    True
    >>> p == BeforePosition(7)
    True

    Just remember that for equality and sorting the position objects act
    like integers.
    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, extension=0):
        """Create the position, refusing any non-zero extension."""
        if extension != 0:
            # NOTE(review): the wording says "exact position" although this
            # is an AfterPosition - looks copied from ExactPosition.
            message = "Non-zero extension %s for exact position." % extension
            raise AttributeError(message)
        return int.__new__(cls, position)

    @property
    def position(self):
        """Legacy attribute to get position as integer (OBSOLETE)."""
        return int(self)

    @property
    def extension(self):
        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
        return 0

    def __repr__(self):
        """A string representation of the location for debugging."""
        class_name = self.__class__.__name__
        return "%s(%i)" % (class_name, int(self))

    def __str__(self):
        """Return the position with a leading greater-than sign."""
        return ">%s" % self.position

    def _shift(self, offset):
        """Return a copy moved along by the offset (PRIVATE)."""
        shifted = int(self) + offset
        return self.__class__(shifted)

    def _flip(self, length):
        """Return the matching BeforePosition once the parent is reversed (PRIVATE)."""
        mirrored = length - int(self)
        return BeforePosition(mirrored)
1782
class OneOfPosition(int, AbstractPosition):
    """Specify a position where the location can be multiple positions.

    This models the GenBank 'one-of(1888,1901)' function, and tries
    to make this fit within the Biopython Position models. If this was
    a start position it should act like 1888, but as an end position 1901.

    >>> p = OneOfPosition(1888, [ExactPosition(1888), ExactPosition(1901)])
    >>> p
    OneOfPosition(1888, choices=[ExactPosition(1888), ExactPosition(1901)])
    >>> int(p)
    1888

    Integer comparisons and operators act like using int(p),

    >>> p == 1888
    True
    >>> p <= 1888
    True
    >>> p > 1888
    False
    >>> p + 100
    1988

    >>> isinstance(p, OneOfPosition)
    True
    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    The old legacy properties of position and extension give the
    starting/lowest/left-most position as an integer, and the
    distance to the ending/highest/right-most position as an integer.
    Note that the position object will act like one of the list of
    possible locations depending on how it was created:

    >>> p2 = OneOfPosition(1901, [ExactPosition(1888), ExactPosition(1901)])
    >>> p.position == p2.position == 1888
    True
    >>> p.extension == p2.extension == 13
    True
    >>> int(p) == int(p2)
    False
    >>> p == 1888
    True
    >>> p2 == 1901
    True

    """

    # Subclasses int so the value must be set in __new__ (not __init__)
    def __new__(cls, position, choices):
        """Initialize with a set of possible positions.

        choices is a list of AbstractPosition derived objects,
        specifying possible locations.

        position is an integer specifying the default behaviour; it
        must match one of the choices.
        """
        assert position in choices, \
            "OneOfPosition: %r should match one of %r" % (position, choices)
        one_of = int.__new__(cls, position)
        one_of.position_choices = choices
        return one_of

    @property
    def position(self):
        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
        return min(int(pos) for pos in self.position_choices)

    @property
    def extension(self):
        """Legacy attribute to get extension as integer (OBSOLETE)."""
        values = [int(pos) for pos in self.position_choices]
        lowest = min(values)
        highest = max(values)
        return highest - lowest

    def __repr__(self):
        """String representation of the OneOfPosition location for debugging."""
        class_name = self.__class__.__name__
        return "%s(%i, choices=%r)" % (class_name, int(self),
                                       self.position_choices)

    def __str__(self):
        """Return the choices in the form one-of(a,b,...)."""
        listing = "".join("%s," % pos for pos in self.position_choices)
        text = "one-of(" + listing
        # replace the last comma with the closing parenthesis
        return text[:-1] + ")"

    def _shift(self, offset):
        """Return a copy with the value and every choice moved by offset (PRIVATE)."""
        moved = [pos._shift(offset) for pos in self.position_choices]
        return self.__class__(int(self) + offset, moved)

    def _flip(self, length):
        """Return a copy for a parent of this length once reversed (PRIVATE)."""
        # The choices are mirrored individually and listed in reverse order.
        mirrored = [pos._flip(length)
                    for pos in reversed(self.position_choices)]
        return self.__class__(length - int(self), mirrored)
1879
class PositionGap(object):
    """Simple class to hold information about a gap between positions."""

    def __init__(self, gap_size):
        """Initialize with a position object containing the gap information."""
        self.gap_size = gap_size

    def __repr__(self):
        """A string representation of the position gap for debugging."""
        class_name = self.__class__.__name__
        return "%s(%r)" % (class_name, self.gap_size)

    def __str__(self):
        """Return the gap in the form gap(size)."""
        return "gap(%s)" % self.gap_size
if __name__ == "__main__":
    # When executed as a script, run the doctests embedded in the
    # docstrings of this module.
    from Bio._utils import run_doctest

    run_doctest()