Package Bio :: Module SeqFeature
[hide private]
[frames | no frames]

Source Code for Module Bio.SeqFeature

   1  # Copyright 2000-2003 Jeff Chang. 
   2  # Copyright 2001-2008 Brad Chapman. 
   3  # Copyright 2005-2016 by Peter Cock. 
   4  # Copyright 2006-2009 Michiel de Hoon. 
   5  # All rights reserved. 
   6  # This code is part of the Biopython distribution and governed by its 
   7  # license.  Please see the LICENSE file that should have been included 
   8  # as part of this package. 
   9  """Represent a Sequence Feature holding info about a part of a sequence. 
  10   
  11  This is heavily modeled after the Biocorba SeqFeature objects, and 
  12  may be pretty biased towards GenBank stuff since I'm writing it 
  13  for the GenBank parser output... 
  14   
  15  What's here: 
  16   
  17  Base class to hold a Feature 
  18  ---------------------------- 
  19   
  20  classes: 
  21   
  22      - SeqFeature 
  23   
  24  Hold information about a Reference 
  25  ---------------------------------- 
  26   
  27  This is an attempt to create a General class to hold Reference type 
  28  information. 
  29   
  30  classes: 
  31   
  32      - Reference 
  33   
  34  Specify locations of a feature on a Sequence 
  35  -------------------------------------------- 
  36   
  37  This aims to handle, in Ewan Birney's words, 'the dreaded fuzziness issue'. 
  38  This has the advantages of allowing us to handle fuzzy stuff in case anyone 
  39  needs it, and also be compatible with BioPerl etc and BioSQL. 
  40   
  41  classes: 
  42   
  43      - FeatureLocation - Specify the start and end location of a feature. 
  44      - CompoundLocation - Collection of FeatureLocation objects (for joins etc). 
  45   
  46      - ExactPosition - Specify the position as being exact. 
  47      - WithinPosition - Specify a position occurring within some range. 
  48      - BetweenPosition - Specify a position occurring between a range (OBSOLETE?). 
  49      - BeforePosition - Specify the position as being found before some base. 
  50      - AfterPosition - Specify the position as being found after some base. 
  51      - OneOfPosition - Specify a position where the location can be multiple positions. 
  52      - UnknownPosition - Represents missing information like '?' in UniProt. 
  53  """ 
  54   
  55  from __future__ import print_function 
  56   
  57  from collections import OrderedDict 
  58   
  59  from Bio._py3k import _is_int_or_long 
  60   
  61  from Bio.Seq import MutableSeq, reverse_complement 
class SeqFeature(object):
    """Represent a Sequence Feature on an object.

    Attributes:

     - location - the location of the feature on the sequence
       (FeatureLocation or CompoundLocation, or None if not available)
     - type - the specified type of the feature (ie. CDS, exon, repeat...)
     - location_operator - a string specifying how this SeqFeature may
       be related to others. For example, for a GenBank feature whose
       location is written as join(...), the location_operator would be
       "join". This is a proxy for feature.location.operator and only
       applies to compound locations.
     - strand - A value specifying on which strand (of a DNA sequence, for
       instance) the feature deals with. 1 indicates the plus strand, -1
       indicates the minus strand, 0 indicates stranded but unknown (? in GFF3),
       while the default of None indicates that strand doesn't apply (dot in GFF3,
       e.g. features on proteins). Note this is a shortcut for accessing the
       strand property of the feature's location.
     - id - A string identifier for the feature.
     - ref - A reference to another sequence. This could be an accession
       number for some different sequence. Note this is a shortcut for the
       reference property of the feature's location.
     - ref_db - A different database for the reference accession number.
       Note this is a shortcut for the reference property of the location
     - qualifiers - A dictionary of qualifiers on the feature. These are
       analogous to the qualifiers from a GenBank feature table. The keys of
       the dictionary are qualifier names, the values are the qualifier
       values. As of Biopython 1.69 this is an ordered dictionary.
    """

    def __init__(self, location=None, type='', location_operator='',
                 strand=None, id="<unknown id>",
                 qualifiers=None, sub_features=None,
                 ref=None, ref_db=None):
        """Initialize a SeqFeature on a Sequence.

        location can either be a FeatureLocation (with strand argument also
        given if required), a CompoundLocation, or None.

        e.g. With no strand, on the forward strand, and on the reverse strand:

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> f1 = SeqFeature(FeatureLocation(5, 10), type="domain")
        >>> f1.strand == f1.location.strand == None
        True
        >>> f2 = SeqFeature(FeatureLocation(7, 110, strand=1), type="CDS")
        >>> f2.strand == f2.location.strand == +1
        True
        >>> f3 = SeqFeature(FeatureLocation(9, 108, strand=-1), type="CDS")
        >>> f3.strand == f3.location.strand == -1
        True

        An invalid strand will trigger an exception:

        >>> f4 = SeqFeature(FeatureLocation(50, 60), strand=2)
        Traceback (most recent call last):
           ...
        ValueError: Strand should be +1, -1, 0 or None, not 2

        Similarly if set via the FeatureLocation directly:

        >>> loc4 = FeatureLocation(50, 60, strand=2)
        Traceback (most recent call last):
           ...
        ValueError: Strand should be +1, -1, 0 or None, not 2

        For exact start/end positions, an integer can be used (as shown above)
        as shorthand for the ExactPosition object. For non-exact locations, the
        FeatureLocation must be specified via the appropriate position objects.

        Note that the strand, ref and ref_db arguments to the SeqFeature are
        now obsolete and will be deprecated in a future release (which will
        give warning messages) and later removed. Set them via the location
        object instead.

        Note that the sub_features argument can no longer be used (passing
        anything other than None raises a TypeError); describe multi-part
        features via a CompoundLocation instead. The location_operator
        argument is only accepted when the location is a CompoundLocation.

        Raises:

         - TypeError - if location is not a FeatureLocation, CompoundLocation
           or None, or if sub_features is given.
         - ValueError - if strand, ref, ref_db or location_operator is given
           but cannot be stored on the location (e.g. the location is None).
        """
        if location is not None \
                and not isinstance(location, (FeatureLocation, CompoundLocation)):
            raise TypeError(
                "FeatureLocation, CompoundLocation (or None) required for the location")
        self.location = location
        self.type = type
        if location_operator:
            # TODO - Deprecation warning
            self.location_operator = location_operator
        if strand is not None:
            # TODO - Deprecation warning
            self.strand = strand
        self.id = id
        if qualifiers is None:
            # Fresh dictionary per instance (never share a mutable default).
            qualifiers = OrderedDict()
        self.qualifiers = qualifiers
        if sub_features is not None:
            # The replacement class is CompoundLocation (defined in this module).
            raise TypeError("Rather than sub_features, use a CompoundLocation")
        if ref is not None:
            # TODO - Deprecation warning
            self.ref = ref
        if ref_db is not None:
            # TODO - Deprecation warning
            self.ref_db = ref_db

    def _get_strand(self):
        """Return the strand of the feature's location (PRIVATE)."""
        return self.location.strand

    def _set_strand(self, value):
        """Set the strand on the feature's location (PRIVATE).

        Setting None on a feature without a location is silently accepted,
        any other value then raises a ValueError.
        """
        try:
            self.location.strand = value
        except AttributeError:
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set strand without a location.")
            else:
                raise

    strand = property(fget=_get_strand, fset=_set_strand,
                      doc="""Feature's strand

                      This is a shortcut for feature.location.strand
                      """)

    def _get_ref(self):
        """Return the location's ref, or None if unavailable (PRIVATE)."""
        try:
            return self.location.ref
        except AttributeError:
            return None

    def _set_ref(self, value):
        """Set the ref on the feature's location (PRIVATE).

        Setting None on a feature without a location is silently accepted,
        any other value then raises a ValueError.
        """
        try:
            self.location.ref = value
        except AttributeError:
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set ref without a location.")
            else:
                raise

    ref = property(fget=_get_ref, fset=_set_ref,
                   doc="""Feature location reference (e.g. accession).

                   This is a shortcut for feature.location.ref
                   """)

    def _get_ref_db(self):
        """Return the location's ref_db, or None if unavailable (PRIVATE)."""
        try:
            return self.location.ref_db
        except AttributeError:
            return None

    def _set_ref_db(self, value):
        """Set the ref_db on the feature's location (PRIVATE).

        Mirrors the strand and ref setters: setting None on a feature
        without a location is silently accepted, any other value then
        raises a ValueError (rather than an opaque AttributeError).
        """
        try:
            self.location.ref_db = value
        except AttributeError:
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set ref_db without a location.")
            else:
                raise

    ref_db = property(fget=_get_ref_db, fset=_set_ref_db,
                      doc="""Feature location reference's database.

                      This is a shortcut for feature.location.ref_db
                      """)

    def _get_location_operator(self):
        """Return the location's operator, or None if it has none (PRIVATE)."""
        try:
            return self.location.operator
        except AttributeError:
            return None

    def _set_location_operator(self, value):
        """Set the operator on a compound location (PRIVATE).

        A false value (e.g. the empty string) is ignored. Otherwise the
        location must be a CompoundLocation, else a ValueError is raised.
        """
        if value:
            if isinstance(self.location, CompoundLocation):
                self.location.operator = value
            elif self.location is None:
                raise ValueError(
                    "Location is None so can't set its operator (to %r)" % value)
            else:
                raise ValueError(
                    "Only CompoundLocation gets an operator (%r)" % value)

    location_operator = property(fget=_get_location_operator,
                                 fset=_set_location_operator,
                                 doc="Location operator for compound locations (e.g. join).")

    def __repr__(self):
        """A string representation of the record for debugging."""
        answer = "%s(%s" % (self.__class__.__name__, repr(self.location))
        if self.type:
            answer += ", type=%s" % repr(self.type)
        if self.location_operator:
            answer += ", location_operator=%s" % repr(self.location_operator)
        if self.id and self.id != "<unknown id>":
            answer += ", id=%s" % repr(self.id)
        if self.ref:
            answer += ", ref=%s" % repr(self.ref)
        if self.ref_db:
            answer += ", ref_db=%s" % repr(self.ref_db)
        answer += ")"
        return answer

    def __str__(self):
        """A readable summary of the feature intended to be printed to screen.

        Qualifiers are listed sorted by key; the id is omitted when it is
        empty or still the "<unknown id>" placeholder.
        """
        lines = ["type: %s\n" % self.type,
                 "location: %s\n" % self.location]
        if self.id and self.id != "<unknown id>":
            lines.append("id: %s\n" % self.id)
        lines.append("qualifiers:\n")
        for qual_key in sorted(self.qualifiers):
            lines.append(" Key: %s, Value: %s\n" % (qual_key,
                                                    self.qualifiers[qual_key]))
        return "".join(lines)

    def _shift(self, offset):
        """Returns a copy of the feature with its location shifted (PRIVATE).

        The annotation qualifiers are copied.
        """
        return SeqFeature(location=self.location._shift(offset),
                          type=self.type,
                          location_operator=self.location_operator,
                          id=self.id,
                          qualifiers=OrderedDict(self.qualifiers.items()))

    def _flip(self, length):
        """Returns a copy of the feature with its location flipped (PRIVATE).

        The argument length gives the length of the parent sequence. For
        example a location 0..20 (+1 strand) with parent length 30 becomes
        after flipping 10..30 (-1 strand). Strandless (None) or unknown
        strand (0) remain like that - just their end points are changed.

        The annotation qualifiers are copied.
        """
        return SeqFeature(location=self.location._flip(length),
                          type=self.type,
                          location_operator=self.location_operator,
                          id=self.id,
                          qualifiers=OrderedDict(self.qualifiers.items()))

    def extract(self, parent_sequence):
        """Extract feature sequence from the supplied parent sequence.

        The parent_sequence can be a Seq like object or a string, and will
        generally return an object of the same type. The exception to this is
        a MutableSeq as the parent sequence will return a Seq object.

        This should cope with complex locations including complements, joins
        and fuzzy positions. Even mixed strand features should work! This
        also covers features on protein sequences (e.g. domains), although
        here reverse strand features are not permitted.

        >>> from Bio.Seq import Seq
        >>> from Bio.Alphabet import generic_protein
        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
        >>> f = SeqFeature(FeatureLocation(8, 15), type="domain")
        >>> f.extract(seq)
        Seq('VALIVIC', ProteinAlphabet())

        If the FeatureLocation is None, e.g. when parsing invalid locus
        locations in the GenBank parser, extract() will raise a ValueError.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SeqFeature
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
        >>> f = SeqFeature(None, type="domain")
        >>> f.extract(seq)
        Traceback (most recent call last):
           ...
        ValueError: The feature's .location is None. Check the sequence file for a valid location.

        Note - currently only compound features of type "join" are supported.
        """
        if self.location is None:
            raise ValueError("The feature's .location is None. Check the "
                             "sequence file for a valid location.")
        return self.location.extract(parent_sequence)

    # Python 3:
    def __bool__(self):
        """Boolean value of an instance of this class (True).

        This behaviour is for backwards compatibility, since until the
        __len__ method was added, a SeqFeature always evaluated as True.

        Note that in comparison, Seq objects, strings, lists, etc, will all
        evaluate to False if they have length zero.

        WARNING: The SeqFeature may in future evaluate to False when its
        length is zero (in order to better match normal python behaviour)!
        """
        return True

    # Python 2:
    __nonzero__ = __bool__

    def __len__(self):
        """Returns the length of the region described by a feature.

        >>> from Bio.Seq import Seq
        >>> from Bio.Alphabet import generic_protein
        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
        >>> f = SeqFeature(FeatureLocation(8, 15), type="domain")
        >>> len(f)
        7
        >>> f.extract(seq)
        Seq('VALIVIC', ProteinAlphabet())
        >>> len(f.extract(seq))
        7

        This is a proxy for taking the length of the feature's location:

        >>> len(f.location)
        7

        For simple features this is the same as the region spanned (end
        position minus start position using Pythonic counting). However, for
        a compound location (e.g. a CDS as the join of several exons) the
        gaps are not counted (e.g. introns). This ensures that len(f) matches
        len(f.extract(parent_seq)), and also makes sure things work properly
        with features wrapping the origin etc.
        """
        return len(self.location)

    def __iter__(self):
        """Iterate over the parent positions within the feature.

        The iteration order is strand aware, and can be thought of as moving
        along the feature using the parent sequence coordinates:

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> f = SeqFeature(FeatureLocation(5, 10), type="domain", strand=-1)
        >>> len(f)
        5
        >>> for i in f: print(i)
        9
        8
        7
        6
        5
        >>> list(f)
        [9, 8, 7, 6, 5]

        This is a proxy for iterating over the location,

        >>> list(f.location)
        [9, 8, 7, 6, 5]
        """
        return iter(self.location)

    def __contains__(self, value):
        """Check if an integer position is within the feature.

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> f = SeqFeature(FeatureLocation(5, 10), type="domain", strand=-1)
        >>> len(f)
        5
        >>> [i for i in range(15) if i in f]
        [5, 6, 7, 8, 9]

        For example, to see which features include a SNP position, you could
        use this:

        >>> from Bio import SeqIO
        >>> record = SeqIO.read("GenBank/NC_000932.gb", "gb")
        >>> for f in record.features:
        ...     if 1750 in f:
        ...         print("%s %s" % (f.type, f.location))
        source [0:154478](+)
        gene [1716:4347](-)
        tRNA join{[4310:4347](-), [1716:1751](-)}

        Note that for a feature defined as a join of several subfeatures (e.g.
        the union of several exons) the gaps are not checked (e.g. introns).
        In this example, the tRNA location is defined in the GenBank file as
        complement(join(1717..1751,4311..4347)), so that position 1760 falls
        in the gap:

        >>> for f in record.features:
        ...     if 1760 in f:
        ...         print("%s %s" % (f.type, f.location))
        source [0:154478](+)
        gene [1716:4347](-)

        Note that additional care may be required with fuzzy locations, for
        example just before a BeforePosition:

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> from Bio.SeqFeature import BeforePosition
        >>> f = SeqFeature(FeatureLocation(BeforePosition(3), 8), type="domain")
        >>> len(f)
        5
        >>> [i for i in range(10) if i in f]
        [3, 4, 5, 6, 7]

        Note that this is a proxy for testing membership on the location.

        >>> [i for i in range(10) if i in f.location]
        [3, 4, 5, 6, 7]
        """
        return value in self.location

# --- References


# TODO -- Will this hold PubMed and Medline information decently?
class Reference(object):
    """Represent a Generic Reference object.

    Attributes:
     o location - A list of Location objects specifying regions of
       the sequence that the references correspond to. If no locations are
       specified, the entire sequence is assumed.
     o authors - A big old string, or a list split by author, of authors
       for the reference.
     o consrtm - A string, empty by default and only shown by str() when
       set (presumably the consortium name, as in the GenBank CONSRTM
       line - TODO confirm).
     o title - The title of the reference.
     o journal - Journal the reference was published in.
     o medline_id - A medline reference for the article.
     o pubmed_id - A pubmed reference for the article.
     o comment - A place to stick any comments about the reference.
    """

    def __init__(self):
        """Create an empty reference; fill in the attributes afterwards."""
        self.location = []
        self.authors = ''
        self.consrtm = ''
        self.title = ''
        self.journal = ''
        self.medline_id = ''
        self.pubmed_id = ''
        self.comment = ''

    def __str__(self):
        """Output an informative string for debugging.

        One "name: value" line per attribute; one line per location, and
        the consrtm line only when that attribute is set.
        """
        lines = ["location: %s\n" % single_location
                 for single_location in self.location]
        lines.append("authors: %s\n" % self.authors)
        if self.consrtm:
            lines.append("consrtm: %s\n" % self.consrtm)
        lines.append("title: %s\n" % self.title)
        lines.append("journal: %s\n" % self.journal)
        lines.append("medline id: %s\n" % self.medline_id)
        lines.append("pubmed id: %s\n" % self.pubmed_id)
        lines.append("comment: %s\n" % self.comment)
        return "".join(lines)

    def __repr__(self):
        """Return a short debugging representation showing only the title."""
        # TODO - Update this if __init__ later accepts values
        return "%s(title=%s, ...)" % (self.__class__.__name__,
                                      repr(self.title))

    def __eq__(self, other):
        """Check if two Reference objects should be considered equal.

        Note that the location is not compared, as __eq__ for the
        FeatureLocation class is not defined.

        Returns NotImplemented for objects which are not Reference
        instances, so that comparing against an unrelated object gives
        False rather than raising an AttributeError.
        """
        if not isinstance(other, Reference):
            return NotImplemented
        return self.authors == other.authors and \
            self.consrtm == other.consrtm and \
            self.title == other.title and \
            self.journal == other.journal and \
            self.medline_id == other.medline_id and \
            self.pubmed_id == other.pubmed_id and \
            self.comment == other.comment

    def __ne__(self, other):
        """Inverse of __eq__ (needed explicitly on Python 2)."""
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

# --- Handling feature locations

class FeatureLocation(object):
    """Specify the location of a feature along a sequence.

    The FeatureLocation is used for simple continuous features, which can
    be described as running from a start position to an end position
    (optionally with a strand and reference information). More complex
    locations made up from several non-continuous parts (e.g. a coding
    sequence made up of several exons) are described using a SeqFeature
    with a CompoundLocation.

    Note that the start and end location numbering follow Python's scheme,
    thus a GenBank entry of 123..150 (one based counting) becomes a location
    of [122:150] (zero based counting).

    >>> from Bio.SeqFeature import FeatureLocation
    >>> f = FeatureLocation(122, 150)
    >>> print(f)
    [122:150]
    >>> print(f.start)
    122
    >>> print(f.end)
    150
    >>> print(f.strand)
    None

    Note the strand defaults to None. If you are working with nucleotide
    sequences you'd want to be explicit if it is the forward strand:

    >>> from Bio.SeqFeature import FeatureLocation
    >>> f = FeatureLocation(122, 150, strand=+1)
    >>> print(f)
    [122:150](+)
    >>> print(f.strand)
    1

    Note that for a parent sequence of length n, the FeatureLocation
    start and end must satisfy the inequality 0 <= start <= end <= n.
    This means even for features on the reverse strand of a nucleotide
    sequence, we expect the 'start' coordinate to be less than the
    'end'.

    >>> from Bio.SeqFeature import FeatureLocation
    >>> r = FeatureLocation(122, 150, strand=-1)
    >>> print(r)
    [122:150](-)
    >>> print(r.start)
    122
    >>> print(r.end)
    150
    >>> print(r.strand)
    -1

    i.e. Rather than thinking of the 'start' and 'end' biologically in a
    strand aware manner, think of them as the 'left most' or 'minimum'
    boundary, and the 'right most' or 'maximum' boundary of the region
    being described. This is particularly important with compound
    locations describing non-continuous regions.

    In the example above we have used standard exact positions, but there
    are also specialised position objects used to represent fuzzy positions
    as well, for example a GenBank location like complement(<123..150)
    would use a BeforePosition object for the start.
    """

    def __init__(self, start, end, strand=None, ref=None, ref_db=None):
        """Specify the start, end, strand etc of a sequence feature.

        start and end arguments specify the values where the feature begins
        and ends. These can either be any of the ``*Position`` objects that
        inherit from AbstractPosition, or can just be integers specifying the
        position. In the case of integers, the values are assumed to be
        exact and are converted into ExactPosition arguments. This is meant
        to make it easy to deal with non-fuzzy ends.

        i.e. Short form:

        >>> from Bio.SeqFeature import FeatureLocation
        >>> loc = FeatureLocation(5, 10, strand=-1)
        >>> print(loc)
        [5:10](-)

        Explicit form:

        >>> from Bio.SeqFeature import FeatureLocation, ExactPosition
        >>> loc = FeatureLocation(ExactPosition(5), ExactPosition(10), strand=-1)
        >>> print(loc)
        [5:10](-)

        Other fuzzy positions are used similarly,

        >>> from Bio.SeqFeature import FeatureLocation
        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
        >>> loc2 = FeatureLocation(BeforePosition(5), AfterPosition(10), strand=-1)
        >>> print(loc2)
        [<5:>10](-)

        For nucleotide features you will also want to specify the strand,
        use 1 for the forward (plus) strand, -1 for the reverse (negative)
        strand, 0 for stranded but strand unknown (? in GFF3), or None for
        when the strand does not apply (dot in GFF3), e.g. features on
        proteins.

        >>> loc = FeatureLocation(5, 10, strand=+1)
        >>> print(loc)
        [5:10](+)
        >>> print(loc.strand)
        1

        Normally feature locations are given relative to the parent
        sequence you are working with, but an explicit accession can
        be given with the optional ref and ref_db strings:

        >>> loc = FeatureLocation(105172, 108462, ref="AL391218.9", strand=1)
        >>> print(loc)
        AL391218.9[105172:108462](+)
        >>> print(loc.ref)
        AL391218.9

        Raises a TypeError if start or end is neither a position object nor
        an integer, and a ValueError if the strand is not +1, -1, 0 or None.
        """
        # TODO - Check 0 <= start <= end (<= length of reference)
        if isinstance(start, AbstractPosition):
            self._start = start
        elif _is_int_or_long(start):
            self._start = ExactPosition(start)
        else:
            raise TypeError("start=%r %s" % (start, type(start)))
        if isinstance(end, AbstractPosition):
            self._end = end
        elif _is_int_or_long(end):
            self._end = ExactPosition(end)
        else:
            raise TypeError("end=%r %s" % (end, type(end)))
        # Goes via the strand property, which validates the value.
        self.strand = strand
        self.ref = ref
        self.ref_db = ref_db

    def _get_strand(self):
        """Return the stored strand value (PRIVATE)."""
        return self._strand

    def _set_strand(self, value):
        """Validate and store the strand value (PRIVATE).

        Raises a ValueError unless value is +1, -1, 0 or None.
        """
        if value not in [+1, -1, 0, None]:
            raise ValueError("Strand should be +1, -1, 0 or None, not %r"
                             % value)
        self._strand = value

    strand = property(fget=_get_strand, fset=_set_strand,
                      doc="Strand of the location (+1, -1, 0 or None).")

    def __str__(self):
        """Returns a representation of the location (with python counting).

        For the simple case this uses the python splicing syntax, [122:150]
        (zero based counting) which GenBank would call 123..150 (one based
        counting).
        """
        answer = "[%s:%s]" % (self._start, self._end)
        if self.ref and self.ref_db:
            answer = "%s:%s%s" % (self.ref_db, self.ref, answer)
        elif self.ref:
            answer = self.ref + answer
        # Is ref_db without ref meaningful?
        if self.strand is None:
            return answer
        elif self.strand == +1:
            return answer + "(+)"
        elif self.strand == -1:
            return answer + "(-)"
        else:
            # strand = 0, stranded but strand unknown, ? in GFF3
            return answer + "(?)"

    def __repr__(self):
        """A string representation of the location for debugging."""
        optional = ""
        if self.strand is not None:
            optional += ", strand=%r" % self.strand
        if self.ref is not None:
            optional += ", ref=%r" % self.ref
        if self.ref_db is not None:
            optional += ", ref_db=%r" % self.ref_db
        return "%s(%r, %r%s)" \
               % (self.__class__.__name__, self.start, self.end, optional)

    def __add__(self, other):
        """Combine location with another feature location, or shift it.

        You can add two feature locations to make a join CompoundLocation:

        >>> from Bio.SeqFeature import FeatureLocation
        >>> f1 = FeatureLocation(5, 10)
        >>> f2 = FeatureLocation(20, 30)
        >>> combined = f1 + f2
        >>> print(combined)
        join{[5:10], [20:30]}

        This is thus equivalent to:

        >>> from Bio.SeqFeature import CompoundLocation
        >>> join = CompoundLocation([f1, f2])
        >>> print(join)
        join{[5:10], [20:30]}

        You can also use sum(...) in this way:

        >>> join = sum([f1, f2])
        >>> print(join)
        join{[5:10], [20:30]}

        Furthermore, you can combine a FeatureLocation with a CompoundLocation
        in this way.

        Separately, adding an integer will give a new FeatureLocation with
        its start and end offset by that amount. For example:

        >>> print(f1)
        [5:10]
        >>> print(f1 + 100)
        [105:110]
        >>> print(200 + f1)
        [205:210]

        This can be useful when editing annotation.
        """
        if isinstance(other, FeatureLocation):
            return CompoundLocation([self, other])
        elif _is_int_or_long(other):
            return self._shift(other)
        else:
            # This will allow CompoundLocation's __radd__ to be called:
            return NotImplemented

    def __radd__(self, other):
        """Support integer + location, shifting the location by that offset."""
        if _is_int_or_long(other):
            return self._shift(other)
        else:
            return NotImplemented

    # Python 3:
    def __bool__(self):
        """Returns True regardless of the length of the feature.

        This behaviour is for backwards compatibility, since until the
        __len__ method was added, a FeatureLocation always evaluated as True.

        Note that in comparison, Seq objects, strings, lists, etc, will all
        evaluate to False if they have length zero.

        WARNING: The FeatureLocation may in future evaluate to False when its
        length is zero (in order to better match normal python behaviour)!
        """
        return True

    # Python 2 (without __bool__, Python 3 would fall back on __len__ and
    # treat a zero length location as False):
    __nonzero__ = __bool__

    def __len__(self):
        """Returns the length of the region described by the FeatureLocation.

        Note that extra care may be needed for fuzzy locations, e.g.

        >>> from Bio.SeqFeature import FeatureLocation
        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
        >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10))
        >>> len(loc)
        5
        """
        return int(self._end) - int(self._start)

    def __contains__(self, value):
        """Check if an integer position is within the FeatureLocation.

        Note that extra care may be needed for fuzzy locations, e.g.

        >>> from Bio.SeqFeature import FeatureLocation
        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
        >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10))
        >>> len(loc)
        5
        >>> [i for i in range(15) if i in loc]
        [5, 6, 7, 8, 9]

        Raises a ValueError if value is not an integer.
        """
        if not _is_int_or_long(value):
            raise ValueError("Currently we only support checking for integer "
                             "positions being within a FeatureLocation.")
        # Half open interval, start <= value < end
        if value < self._start or value >= self._end:
            return False
        else:
            return True

    def __iter__(self):
        """Iterate over the parent positions within the FeatureLocation.

        >>> from Bio.SeqFeature import FeatureLocation
        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
        >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10))
        >>> len(loc)
        5
        >>> for i in loc: print(i)
        5
        6
        7
        8
        9
        >>> list(loc)
        [5, 6, 7, 8, 9]
        >>> [i for i in range(15) if i in loc]
        [5, 6, 7, 8, 9]

        Note this is strand aware:

        >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10), strand = -1)
        >>> list(loc)
        [9, 8, 7, 6, 5]
        """
        if self.strand == -1:
            # Reverse strand, walk from the right most position down.
            for i in range(self._end - 1, self._start - 1, -1):
                yield i
        else:
            for i in range(self._start, self._end):
                yield i

    def _shift(self, offset):
        """Returns a copy of the location shifted by the offset (PRIVATE).

        Raises a ValueError if the location references another sequence.
        """
        # TODO - What if offset is a fuzzy position?
        if self.ref or self.ref_db:
            # TODO - Return self?
            raise ValueError("Feature references another sequence.")
        return FeatureLocation(start=self._start._shift(offset),
                               end=self._end._shift(offset),
                               strand=self.strand)

    def _flip(self, length):
        """Returns a copy of the location after the parent is reversed (PRIVATE).

        Raises a ValueError if the location references another sequence.
        """
        if self.ref or self.ref_db:
            # TODO - Return self?
            raise ValueError("Feature references another sequence.")
        # Note this will flip the start and end too!
        if self.strand == +1:
            flip_strand = -1
        elif self.strand == -1:
            flip_strand = +1
        else:
            # 0 or None
            flip_strand = self.strand
        return FeatureLocation(start=self._end._flip(length),
                               end=self._start._flip(length),
                               strand=flip_strand)

    @property
    def parts(self):
        """Read only list of parts (always one, the Feature Location).

        This is a convenience property allowing you to write code handling
        both simple FeatureLocation objects (with one part) and more complex
        CompoundLocation objects (with multiple parts) interchangeably.
        """
        return [self]

    @property
    def start(self):
        """Start location - left most (minimum) value, regardless of strand.

        Read only, returns an integer like position object, possibly a fuzzy
        position.
        """
        return self._start

    @property
    def end(self):
        """End location - right most (maximum) value, regardless of strand.

        Read only, returns an integer like position object, possibly a fuzzy
        position.
        """
        return self._end

    @property
    def nofuzzy_start(self):
        """Start position (integer, approximated if fuzzy, read only) (OBSOLETE).

        This is now an alias for int(feature.start), which should be
        used in preference -- unless you are trying to support old
        versions of Biopython.

        Returns None if the start is an UnknownPosition.
        """
        try:
            return int(self._start)
        except TypeError:
            if isinstance(self._start, UnknownPosition):
                return None
            raise

    @property
    def nofuzzy_end(self):
        """End position (integer, approximated if fuzzy, read only) (OBSOLETE).

        This is now an alias for int(feature.end), which should be
        used in preference -- unless you are trying to support old
        versions of Biopython.

        Returns None if the end is an UnknownPosition.
        """
        try:
            return int(self._end)
        except TypeError:
            if isinstance(self._end, UnknownPosition):
                return None
            raise

    def extract(self, parent_sequence):
        """Extract feature sequence from the supplied parent sequence.

        The slice for this location is taken from parent_sequence, and
        reverse complemented if the strand is -1. Raises a ValueError if
        the location references another sequence.
        """
        if self.ref or self.ref_db:
            # TODO - Take a dictionary as an optional argument?
            raise ValueError("Feature references another sequence.")
        if isinstance(parent_sequence, MutableSeq):
            # This avoids complications with reverse complements
            # (the MutableSeq reverse complement acts in situ)
            parent_sequence = parent_sequence.toseq()
        f_seq = parent_sequence[self.nofuzzy_start:self.nofuzzy_end]
        if self.strand == -1:
            try:
                f_seq = f_seq.reverse_complement()
            except AttributeError:
                # Plain strings have no reverse_complement method
                assert isinstance(f_seq, str)
                f_seq = reverse_complement(f_seq)
        return f_seq

class CompoundLocation(object):
    """For handling joins etc where a feature location has several parts."""

    def __init__(self, parts, operator="join"):
        """Create a compound location with several parts.

        Arguments:
         - parts - iterable of two or more FeatureLocation objects.
         - operator - string naming how the parts combine (default "join").

        Raises ValueError if any part is not a FeatureLocation, or if fewer
        than two parts are given.

        >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation
        >>> f1 = FeatureLocation(10, 40, strand=+1)
        >>> f2 = FeatureLocation(50, 59, strand=+1)
        >>> f = CompoundLocation([f1, f2])
        >>> len(f) == len(f1) + len(f2) == 39 == len(list(f))
        True
        >>> print(f.operator)
        join
        >>> 5 in f
        False
        >>> 15 in f
        True
        >>> f.strand
        1

        Notice that the strand of the compound location is computed
        automatically - in the case of mixed strands on the sub-locations
        the overall strand is set to None.

        >>> f = CompoundLocation([FeatureLocation(3, 6, strand=+1),
        ...                       FeatureLocation(10, 13, strand=-1)])
        >>> print(f.strand)
        None
        >>> len(f)
        6
        >>> list(f)
        [3, 4, 5, 12, 11, 10]

        The example above doing list(f) iterates over the coordinates within the
        feature. This allows you to use max and min on the location, to find the
        range covered:

        >>> min(f)
        3
        >>> max(f)
        12

        More generally, you can use the compound location's start and end which
        give the full range covered, 0 <= start <= end <= full sequence length.

        >>> f.start == min(f)
        True
        >>> f.end == max(f) + 1
        True

        This is consistent with the behaviour of the simple FeatureLocation for
        a single region, where again the 'start' and 'end' do not necessarily
        give the biological start and end, but rather the 'minimal' and 'maximal'
        coordinate boundaries.

        Note that adding locations provides a more intuitive method of
        construction:

        >>> f = FeatureLocation(3, 6, strand=+1) + FeatureLocation(10, 13, strand=-1)
        >>> len(f)
        6
        >>> list(f)
        [3, 4, 5, 12, 11, 10]
        """
        self.operator = operator
        self.parts = list(parts)
        for loc in self.parts:
            if not isinstance(loc, FeatureLocation):
                raise ValueError("CompoundLocation should be given a list of "
                                 "FeatureLocation objects, not %s" % loc.__class__)
        # Measure and report the materialised list rather than the argument:
        # the argument may be a generator (no len(), and already consumed
        # above) or a tuple (which % would unpack as separate format values).
        if len(self.parts) < 2:
            raise ValueError(
                "CompoundLocation should have at least 2 parts, not %r"
                % (self.parts,))

    def __str__(self):
        """Returns a representation of the location (with python counting)."""
        return "%s{%s}" % (self.operator, ", ".join(str(loc) for loc in self.parts))

    def __repr__(self):
        """String representation of the location for debugging."""
        return "%s(%r, %r)" % (self.__class__.__name__,
                               self.parts, self.operator)

    def _get_strand(self):
        # Historically a join on the reverse strand has been represented
        # in Biopython with both the parent SeqFeature and its children
        # (the exons for a CDS) all given a strand of -1.  Likewise, for
        # a join feature on the forward strand they all have strand +1.
        # However, we must also consider evil mixed strand examples like
        # this, join(complement(69611..69724),139856..140087,140625..140650)
        if len(set(loc.strand for loc in self.parts)) == 1:
            return self.parts[0].strand
        else:
            return None  # i.e. mixed strands

    def _set_strand(self, value):
        # Should this be allowed/encouraged?
        for loc in self.parts:
            loc.strand = value

    strand = property(fget=_get_strand, fset=_set_strand,
                      doc="""Overall strand of the compound location.

        If all the parts have the same strand, that is returned. Otherwise
        for mixed strands, this returns None.

        >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation
        >>> f1 = FeatureLocation(15, 17, strand=1)
        >>> f2 = FeatureLocation(20, 30, strand=-1)
        >>> f = f1 + f2
        >>> f1.strand
        1
        >>> f2.strand
        -1
        >>> f.strand
        >>> f.strand is None
        True

        If you set the strand of a CompoundLocation, this is applied to
        all the parts - use with caution:

        >>> f.strand = 1
        >>> f1.strand
        1
        >>> f2.strand
        1
        >>> f.strand
        1

        """)

    def __add__(self, other):
        """Combine locations, or shift the location by an integer offset.

        >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation
        >>> f1 = FeatureLocation(15, 17) + FeatureLocation(20, 30)
        >>> print(f1)
        join{[15:17], [20:30]}

        You can add another FeatureLocation:

        >>> print(f1 + FeatureLocation(40, 50))
        join{[15:17], [20:30], [40:50]}
        >>> print(FeatureLocation(5, 10) + f1)
        join{[5:10], [15:17], [20:30]}

        You can also add another CompoundLocation:

        >>> f2 = FeatureLocation(40, 50) + FeatureLocation(60, 70)
        >>> print(f2)
        join{[40:50], [60:70]}
        >>> print(f1 + f2)
        join{[15:17], [20:30], [40:50], [60:70]}

        Also, as with the FeatureLocation, adding an integer shifts the
        location's co-ordinates by that offset:

        >>> print(f1 + 100)
        join{[115:117], [120:130]}
        >>> print(200 + f1)
        join{[215:217], [220:230]}
        >>> print(f1 + (-5))
        join{[10:12], [15:25]}

        Raises ValueError when adding a CompoundLocation with a different
        operator, and NotImplementedError for any other type of operand.
        """
        if isinstance(other, FeatureLocation):
            return CompoundLocation(self.parts + [other], self.operator)
        elif isinstance(other, CompoundLocation):
            if self.operator != other.operator:
                # Handle join+order -> order as a special case?
                raise ValueError("Mixed operators %s and %s"
                                 % (self.operator, other.operator))
            return CompoundLocation(self.parts + other.parts, self.operator)
        elif _is_int_or_long(other):
            return self._shift(other)
        else:
            raise NotImplementedError

    def __radd__(self, other):
        """Combine locations, or shift by an integer, with self on the right."""
        if isinstance(other, FeatureLocation):
            return CompoundLocation([other] + self.parts, self.operator)
        elif _is_int_or_long(other):
            return self._shift(other)
        else:
            raise NotImplementedError

    def __contains__(self, value):
        """Check if an integer position is within the location."""
        return any(value in loc for loc in self.parts)

    def __nonzero__(self):
        """Returns True regardless of the length of the location.

        This behaviour is for backwards compatibility, since until the
        __len__ method was added, a location always evaluated as True.

        Note that in comparison, Seq objects, strings, lists, etc, will all
        evaluate to False if they have length zero.

        WARNING: The location may in future evaluate to False when its
        length is zero (in order to better match normal python behaviour)!
        """
        return True

    # Python 3 looks up __bool__ rather than __nonzero__; without this alias
    # truth testing would fall back to __len__ and a zero length location
    # would evaluate as False, contrary to the documented behaviour above.
    __bool__ = __nonzero__

    def __len__(self):
        """Total length of the location, the sum of the lengths of the parts."""
        return sum(len(loc) for loc in self.parts)

    def __iter__(self):
        """Iterate over the positions of each part in turn, in part order."""
        for loc in self.parts:
            for pos in loc:
                yield pos

    def _shift(self, offset):
        """Returns a copy of the location shifted by the offset (PRIVATE)."""
        return CompoundLocation([loc._shift(offset) for loc in self.parts],
                                self.operator)

    def _flip(self, length):
        """Returns a copy of the location after the parent is reversed (PRIVATE).

        Note that the order of the parts is NOT reversed too. Consider a CDS
        on the forward strand with exons small, medium and large (in length).
        Once we change the frame of reference to the reverse complement strand,
        the start codon is still part of the small exon, and the stop codon
        still part of the large exon - so the part order remains the same!

        Here is an artificial example, where the features map to the two upper
        case regions and the lower case runs of n are not used:

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import FeatureLocation
        >>> dna = Seq("nnnnnAGCATCCTGCTGTACnnnnnnnnGAGAMTGCCATGCCCCTGGAGTGAnnnnn")
        >>> small = FeatureLocation(5, 20, strand=1)
        >>> large = FeatureLocation(28, 52, strand=1)
        >>> location = small + large
        >>> print(small)
        [5:20](+)
        >>> print(large)
        [28:52](+)
        >>> print(location)
        join{[5:20](+), [28:52](+)}
        >>> for part in location.parts:
        ...     print(len(part))
        ...
        15
        24

        As you can see, this is a silly example where each "exon" is a word:

        >>> print(small.extract(dna).translate())
        SILLY
        >>> print(large.extract(dna).translate())
        EXAMPLE*
        >>> print(location.extract(dna).translate())
        SILLYEXAMPLE*
        >>> for part in location.parts:
        ...     print(part.extract(dna).translate())
        ...
        SILLY
        EXAMPLE*

        Now, let's look at this from the reverse strand frame of reference:

        >>> flipped_dna = dna.reverse_complement()
        >>> flipped_location = location._flip(len(dna))
        >>> print(flipped_location.extract(flipped_dna).translate())
        SILLYEXAMPLE*
        >>> for part in flipped_location.parts:
        ...     print(part.extract(flipped_dna).translate())
        ...
        SILLY
        EXAMPLE*

        The key point here is the first part of the CompoundLocation is still the
        small exon, while the second part is still the large exon:

        >>> for part in flipped_location.parts:
        ...     print(len(part))
        ...
        15
        24
        >>> print(flipped_location)
        join{[37:52](-), [5:29](-)}

        Notice the parts are not reversed. However, there was a bug here in older
        versions of Biopython which would have given join{[5:29](-), [37:52](-)}
        and the translation would have wrongly been "EXAMPLE*SILLY" instead.

        """
        return CompoundLocation([loc._flip(length) for loc in self.parts],
                                self.operator)

    @property
    def start(self):
        """Start location - left most (minimum) value, regardless of strand.

        Read only, returns an integer like position object, possibly a fuzzy
        position.

        For the special case of a CompoundLocation wrapping the origin of a
        circular genome, this will return zero.
        """
        return min(loc.start for loc in self.parts)

    @property
    def end(self):
        """End location - right most (maximum) value, regardless of strand.

        Read only, returns an integer like position object, possibly a fuzzy
        position.

        The end is exclusive, as with Python slicing, so it is one more than
        the largest coordinate covered. For the special case of a
        CompoundLocation wrapping the origin of a circular genome this will
        therefore match the genome length.
        """
        return max(loc.end for loc in self.parts)

    @property
    def nofuzzy_start(self):
        """Start position (integer, approximated if fuzzy, read only) (OBSOLETE).

        This is an alias for int(feature.start), which should be used in
        preference -- unless you are trying to support old versions of
        Biopython.

        Returns None if the start is an UnknownPosition.
        """
        try:
            return int(self.start)
        except TypeError:
            if isinstance(self.start, UnknownPosition):
                return None
            raise

    @property
    def nofuzzy_end(self):
        """End position (integer, approximated if fuzzy, read only) (OBSOLETE).

        This is an alias for int(feature.end), which should be used in
        preference -- unless you are trying to support old versions of
        Biopython.

        Returns None if the end is an UnknownPosition.
        """
        try:
            return int(self.end)
        except TypeError:
            if isinstance(self.end, UnknownPosition):
                return None
            raise

    @property
    def ref(self):
        """CompoundLocation's don't have a ref (dummy method for API compatibility)."""
        return None

    @property
    def ref_db(self):
        """CompoundLocation's don't have a ref_db (dummy method for API compatibility)."""
        return None

    def extract(self, parent_sequence):
        """Extract feature sequence from the supplied parent sequence.

        Each part is extracted separately (applying its own strand) and the
        pieces are concatenated in part order.
        """
        # This copes with mixed strand features & all on reverse:
        parts = [loc.extract(parent_sequence) for loc in self.parts]
        # We use addition rather than a join to avoid alphabet issues:
        f_seq = parts[0]
        for part in parts[1:]:
            f_seq += part
        return f_seq
1317
class AbstractPosition(object):
    """Common abstract base class for the position objects."""

    def __repr__(self):
        """Return a placeholder debugging string of the form ClassName(...)."""
        label = type(self).__name__
        return "%s(...)" % label
1325
class ExactPosition(int, AbstractPosition):
    """Specify the specific position of a boundary.

    Arguments:
     - position - The position of the boundary.
     - extension - An optional argument which must be zero since we don't
       have an extension. The argument is provided so that the same number
       of arguments can be passed to all position types.

    In this case, there is no fuzziness associated with the position.

    >>> p = ExactPosition(5)
    >>> p
    ExactPosition(5)
    >>> print(p)
    5

    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    Integer comparisons and operations should work as expected:

    >>> p == 5
    True
    >>> p < 6
    True
    >>> p <= 5
    True
    >>> p + 10
    15

    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, extension=0):
        """Build the integer valued position, rejecting a non-zero extension."""
        if extension != 0:
            message = "Non-zero extension %s for exact position." % extension
            raise AttributeError(message)
        return int.__new__(cls, position)

    def __repr__(self):
        """String representation of the ExactPosition location for debugging."""
        return "%s(%i)" % (type(self).__name__, int(self))

    @property
    def position(self):
        """Legacy attribute to get position as integer (OBSOLETE)."""
        return int(self)

    @property
    def extension(self):
        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
        return 0

    def _shift(self, offset):
        """Return a copy moved by the offset, keeping any subclass (PRIVATE)."""
        moved = int(self) + offset
        return type(self)(moved)

    def _flip(self, length):
        """Return a copy mirrored within length, keeping any subclass (PRIVATE)."""
        mirrored = length - int(self)
        return type(self)(mirrored)
1387
class UncertainPosition(ExactPosition):
    """Specify a specific position which is uncertain.

    This is used in UniProt, e.g. ?222 for uncertain position 222, or in the
    XML format explicitly marked as uncertain. Does not apply to GenBank/EMBL.

    No behaviour is added here; everything is inherited from ExactPosition
    and only the class itself marks the position as uncertain.
    """
1396
class UnknownPosition(AbstractPosition):
    """Specify a specific position which is unknown (has no position).

    This is used in UniProt, e.g. ? or in the XML as unknown.

    Unlike the other position classes this one does not subclass int.
    """

    def __repr__(self):
        """String representation of the UnknownPosition location for debugging."""
        label = type(self).__name__
        return "%s()" % label

    def __hash__(self):
        """Hash every unknown position the same way as None."""
        return hash(None)

    @property
    def position(self):
        """Legacy attribute to get position (None) (OBSOLETE)."""
        return None

    @property
    def extension(self):
        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
        return 0

    def _shift(self, offset):
        """Return self unchanged; the offset is ignored (PRIVATE)."""
        return self

    def _flip(self, length):
        """Return self unchanged; the length is ignored (PRIVATE)."""
        return self
1426
class WithinPosition(int, AbstractPosition):
    """Specify the position of a boundary within some coordinates.

    Arguments:
     - position - The default integer position
     - left - The start (left) position of the boundary
     - right - The end (right) position of the boundary

    This allows dealing with a position like ((1.4)..100). This
    indicates that the start of the sequence is somewhere between 1
    and 4. Since this is a start coordinate, it should act like
    it is at position 1 (or in Python counting, 0).

    >>> p = WithinPosition(10, 10, 13)
    >>> p
    WithinPosition(10, left=10, right=13)
    >>> print(p)
    (10.13)
    >>> int(p)
    10

    Basic integer comparisons and operations should work as though
    this were a plain integer:

    >>> p == 10
    True
    >>> p in [9, 10, 11]
    True
    >>> p < 11
    True
    >>> p + 10
    20

    >>> isinstance(p, WithinPosition)
    True
    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    Note this also applies for comparison to other position objects,
    where again the integer behaviour is used:

    >>> p == 10
    True
    >>> p == ExactPosition(10)
    True
    >>> p == BeforePosition(10)
    True
    >>> p == AfterPosition(10)
    True

    If this were an end point, you would want the position to be 13:

    >>> p2 = WithinPosition(13, 10, 13)
    >>> p2
    WithinPosition(13, left=10, right=13)
    >>> print(p2)
    (10.13)
    >>> int(p2)
    13
    >>> p2 == 13
    True
    >>> p2 == ExactPosition(13)
    True

    The old legacy properties of position and extension give the
    starting/lower/left position as an integer, and the distance
    to the ending/higher/right position as an integer. Note that
    the position object will act like either the left or the right
    end-point depending on how it was created:

    >>> p.position == p2.position == 10
    True
    >>> p.extension == p2.extension == 3
    True
    >>> int(p) == int(p2)
    False
    >>> p == 10
    True
    >>> p2 == 13
    True

    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, left, right):
        """Create the position; it must equal either left or right."""
        assert position == left or position == right, \
            "WithinPosition: %r should match left %r or right %r" \
            % (position, left, right)
        obj = int.__new__(cls, position)
        obj._left = left
        obj._right = right
        return obj

    def __getnewargs__(self):
        """Return the arguments accepted by __new__.

        The version inherited from int supplies only the integer value,
        which is not enough for the three argument __new__ above, so
        copying or pickling an instance would otherwise raise TypeError.
        """
        return (int(self), self._left, self._right)

    def __repr__(self):
        """String representation of the WithinPosition location for debugging."""
        return "%s(%i, left=%i, right=%i)" \
               % (self.__class__.__name__, int(self),
                  self._left, self._right)

    def __str__(self):
        """Return the range in the form (left.right)."""
        return "(%s.%s)" % (self._left, self._right)

    @property
    def position(self):
        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
        return self._left

    @property
    def extension(self):
        """Legacy attribute to get extension (from left to right) as an integer (OBSOLETE)."""
        return self._right - self._left

    def _shift(self, offset):
        """Return a copy with all three coordinates moved by offset (PRIVATE)."""
        return self.__class__(int(self) + offset,
                              self._left + offset,
                              self._right + offset)

    def _flip(self, length):
        """Return a copy mirrored within length (PRIVATE).

        The old right boundary becomes the new left one and vice versa.
        """
        return self.__class__(length - int(self),
                              length - self._right,
                              length - self._left)
1549
class BetweenPosition(int, AbstractPosition):
    """Specify the position of a boundary between two coordinates (OBSOLETE?).

    Arguments:
     - position - The default integer position
     - left - The start (left) position of the boundary
     - right - The end (right) position of the boundary

    This allows dealing with a position like 123^456. This
    indicates that the start of the sequence is somewhere between
    123 and 456. It is up to the parser to set the position argument
    to either boundary point (depending on if this is being used as
    a start or end of the feature). For example as a feature end:

    >>> p = BetweenPosition(456, 123, 456)
    >>> p
    BetweenPosition(456, left=123, right=456)
    >>> print(p)
    (123^456)
    >>> int(p)
    456

    Integer equality and comparison use the given position,

    >>> p == 456
    True
    >>> p in [455, 456, 457]
    True
    >>> p > 300
    True

    The old legacy properties of position and extension give the
    starting/lower/left position as an integer, and the distance
    to the ending/higher/right position as an integer. Note that
    the position object will act like either the left or the right
    end-point depending on how it was created:

    >>> p2 = BetweenPosition(123, left=123, right=456)
    >>> p.position == p2.position == 123
    True
    >>> p.extension
    333
    >>> p2.extension
    333
    >>> p.extension == p2.extension == 333
    True
    >>> int(p) == int(p2)
    False
    >>> p == 456
    True
    >>> p2 == 123
    True

    Note this potentially surprising behaviour:

    >>> BetweenPosition(123, left=123, right=456) == ExactPosition(123)
    True
    >>> BetweenPosition(123, left=123, right=456) == BeforePosition(123)
    True
    >>> BetweenPosition(123, left=123, right=456) == AfterPosition(123)
    True

    i.e. For equality (and sorting) the position objects behave like
    integers.
    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, left, right):
        """Create the position; it must equal either left or right."""
        assert position == left or position == right, \
            "BetweenPosition: %r should match left %r or right %r" \
            % (position, left, right)
        obj = int.__new__(cls, position)
        obj._left = left
        obj._right = right
        return obj

    def __getnewargs__(self):
        """Return the arguments accepted by __new__.

        The version inherited from int supplies only the integer value,
        which is not enough for the three argument __new__ above, so
        copying or pickling an instance would otherwise raise TypeError.
        """
        return (int(self), self._left, self._right)

    def __repr__(self):
        """String representation of the BetweenPosition location for debugging."""
        return "%s(%i, left=%i, right=%i)" \
               % (self.__class__.__name__, int(self),
                  self._left, self._right)

    def __str__(self):
        """Return the range in the form (left^right)."""
        return "(%s^%s)" % (self._left, self._right)

    @property
    def position(self):
        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
        return self._left

    @property
    def extension(self):
        """Legacy attribute to get extension (from left to right) as an integer (OBSOLETE)."""
        return self._right - self._left

    def _shift(self, offset):
        """Return a copy with all three coordinates moved by offset (PRIVATE)."""
        return self.__class__(int(self) + offset,
                              self._left + offset,
                              self._right + offset)

    def _flip(self, length):
        """Return a copy mirrored within length (PRIVATE).

        The old right boundary becomes the new left one and vice versa.
        """
        return self.__class__(length - int(self),
                              length - self._right,
                              length - self._left)
1651
class BeforePosition(int, AbstractPosition):
    """Specify a position where the actual location occurs before it.

    Arguments:
     - position - The upper boundary of where the location can occur.
     - extension - An optional argument which must be zero since we don't
       have an extension. The argument is provided so that the same number
       of arguments can be passed to all position types.

    This is used to specify positions like (<10..100) where the location
    occurs somewhere before position 10.

    >>> p = BeforePosition(5)
    >>> p
    BeforePosition(5)
    >>> print(p)
    <5
    >>> int(p)
    5
    >>> p + 10
    15

    Note this potentially surprising behaviour:

    >>> p == ExactPosition(5)
    True
    >>> p == AfterPosition(5)
    True

    Just remember that for equality and sorting the position objects act
    like integers.
    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, extension=0):
        """Build the integer valued position, rejecting a non-zero extension.

        Raises AttributeError if extension is not zero.
        """
        if extension != 0:
            # Name this kind of position in the message (it used to say
            # "exact position", copied over from ExactPosition).
            raise AttributeError("Non-zero extension %s for before position."
                                 % extension)
        return int.__new__(cls, position)

    @property
    def position(self):
        """Legacy attribute to get position as integer (OBSOLETE)."""
        return int(self)

    @property
    def extension(self):
        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
        return 0

    def __repr__(self):
        """A string representation of the location for debugging."""
        return "%s(%i)" % (self.__class__.__name__, int(self))

    def __str__(self):
        """Return the position prefixed with a less-than sign, e.g. <5."""
        return "<%s" % self.position

    def _shift(self, offset):
        """Return a copy moved by the offset, keeping any subclass (PRIVATE)."""
        return self.__class__(int(self) + offset)

    def _flip(self, length):
        """Return the mirrored position as an AfterPosition (PRIVATE)."""
        return AfterPosition(length - int(self))
1714
class AfterPosition(int, AbstractPosition):
    """Specify a position where the actual location is found after it.

    Arguments:
     - position - The lower boundary of where the location can occur.
     - extension - An optional argument which must be zero since we don't
       have an extension. The argument is provided so that the same number
       of arguments can be passed to all position types.

    This is used to specify positions like (>10..100) where the location
    occurs somewhere after position 10.

    >>> p = AfterPosition(7)
    >>> p
    AfterPosition(7)
    >>> print(p)
    >7
    >>> int(p)
    7
    >>> p + 10
    17

    >>> isinstance(p, AfterPosition)
    True
    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    Note this potentially surprising behaviour:

    >>> p == ExactPosition(7)
    True
    >>> p == BeforePosition(7)
    True

    Just remember that for equality and sorting the position objects act
    like integers.
    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, extension=0):
        """Build the integer valued position, rejecting a non-zero extension.

        Raises AttributeError if extension is not zero.
        """
        if extension != 0:
            # Name this kind of position in the message (it used to say
            # "exact position", copied over from ExactPosition).
            raise AttributeError("Non-zero extension %s for after position."
                                 % extension)
        return int.__new__(cls, position)

    @property
    def position(self):
        """Legacy attribute to get position as integer (OBSOLETE)."""
        return int(self)

    @property
    def extension(self):
        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
        return 0

    def __repr__(self):
        """A string representation of the location for debugging."""
        return "%s(%i)" % (self.__class__.__name__, int(self))

    def __str__(self):
        """Return the position prefixed with a greater-than sign, e.g. >7."""
        return ">%s" % self.position

    def _shift(self, offset):
        """Return a copy moved by the offset, keeping any subclass (PRIVATE)."""
        return self.__class__(int(self) + offset)

    def _flip(self, length):
        """Return the mirrored position as a BeforePosition (PRIVATE)."""
        return BeforePosition(length - int(self))
1784
class OneOfPosition(int, AbstractPosition):
    """Specify a position where the location can be multiple positions.

    This models the GenBank 'one-of(1888,1901)' function, and tries
    to make this fit within the Biopython Position models. If this was
    a start position it should act like 1888, but as an end position 1901.

    >>> p = OneOfPosition(1888, [ExactPosition(1888), ExactPosition(1901)])
    >>> p
    OneOfPosition(1888, choices=[ExactPosition(1888), ExactPosition(1901)])
    >>> int(p)
    1888

    Integer comparisons and operators act like using int(p),

    >>> p == 1888
    True
    >>> p <= 1888
    True
    >>> p > 1888
    False
    >>> p + 100
    1988

    >>> isinstance(p, OneOfPosition)
    True
    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    The old legacy properties of position and extension give the
    starting/lowest/left-most position as an integer, and the
    distance to the ending/highest/right-most position as an integer.
    Note that the position object will act like one of the list of
    possible locations depending on how it was created:

    >>> p2 = OneOfPosition(1901, [ExactPosition(1888), ExactPosition(1901)])
    >>> p.position == p2.position == 1888
    True
    >>> p.extension == p2.extension == 13
    True
    >>> int(p) == int(p2)
    False
    >>> p == 1888
    True
    >>> p2 == 1901
    True

    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, choices):
        """Initialize with a set of possible positions.

        choices is a list of AbstractPosition derived objects,
        specifying possible locations.

        position is an integer specifying the default behaviour, and
        must be one of the choices.
        """
        assert position in choices, \
            "OneOfPosition: %r should match one of %r" % (position, choices)
        obj = int.__new__(cls, position)
        obj.position_choices = choices
        return obj

    def __getnewargs__(self):
        """Return the arguments accepted by __new__.

        The version inherited from int supplies only the integer value,
        which is not enough for the two argument __new__ above, so
        copying or pickling an instance would otherwise raise TypeError.
        """
        return (int(self), self.position_choices)

    @property
    def position(self):
        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
        return min(int(pos) for pos in self.position_choices)

    @property
    def extension(self):
        """Legacy attribute to get extension as integer (OBSOLETE)."""
        positions = [int(pos) for pos in self.position_choices]
        return max(positions) - min(positions)

    def __repr__(self):
        """String representation of the OneOfPosition location for debugging."""
        return "%s(%i, choices=%r)" % (self.__class__.__name__,
                                       int(self), self.position_choices)

    def __str__(self):
        """Return the choices in the form one-of(a,b,...)."""
        return "one-of(%s)" % ",".join(str(pos) for pos in self.position_choices)

    def _shift(self, offset):
        """Return a copy with the value and every choice moved by offset (PRIVATE)."""
        return self.__class__(int(self) + offset,
                              [p._shift(offset) for p in self.position_choices])

    def _flip(self, length):
        """Return a copy mirrored within length (PRIVATE).

        Each choice is flipped and the order of the choices is reversed.
        """
        return self.__class__(length - int(self),
                              [p._flip(length) for p in self.position_choices[::-1]])
1881
class PositionGap(object):
    """Simple class to hold information about a gap between positions."""

    def __init__(self, gap_size):
        """Store the supplied gap size on the instance as gap_size."""
        self.gap_size = gap_size

    def __repr__(self):
        """A string representation of the position gap for debugging."""
        label = type(self).__name__
        return "%s(%r)" % (label, self.gap_size)

    def __str__(self):
        """Return the gap in the form gap(size)."""
        return "gap(%s)" % self.gap_size
# When this module is run directly as a script, run its doctests using
# the run_doctest helper from Bio._utils.
if __name__ == "__main__":
    from Bio._utils import run_doctest
    run_doctest()