Package Bio :: Module SeqFeature
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqFeature

   1  # Copyright 2000-2003 Jeff Chang. 
   2  # Copyright 2001-2008 Brad Chapman. 
   3  # Copyright 2005-2015 by Peter Cock. 
   4  # Copyright 2006-2009 Michiel de Hoon. 
   5  # All rights reserved. 
   6  # This code is part of the Biopython distribution and governed by its 
   7  # license.  Please see the LICENSE file that should have been included 
   8  # as part of this package. 
   9  """Represent a Sequence Feature holding info about a part of a sequence. 
  10   
  11  This is heavily modeled after the Biocorba SeqFeature objects, and 
  12  may be pretty biased towards GenBank stuff since I'm writing it 
  13  for the GenBank parser output... 
  14   
  15  What's here: 
  16   
  17  Base class to hold a Feature 
  18  ---------------------------- 
  19   
  20  classes: 
  21   
  22      - SeqFeature 
  23   
  24  Hold information about a Reference 
  25  ---------------------------------- 
  26   
  27  This is an attempt to create a General class to hold Reference type 
  28  information. 
  29   
  30  classes: 
  31   
  32      - Reference 
  33   
  34  Specify locations of a feature on a Sequence 
  35  -------------------------------------------- 
  36   
  37  This aims to handle, in Ewan Birney's words, 'the dreaded fuzziness issue'. 
  38  This has the advantages of allowing us to handle fuzzy stuff in case anyone 
  39  needs it, and also be compatible with BioPerl etc and BioSQL. 
  40   
  41  classes: 
  42   
  43      - FeatureLocation - Specify the start and end location of a feature. 
  44      - CompoundLocation - Collection of FeatureLocation objects (for joins etc). 
  45   
  46      - ExactPosition - Specify the position as being exact. 
  47      - WithinPosition - Specify a position occurring within some range. 
  48      - BetweenPosition - Specify a position occurring between a range (OBSOLETE?). 
  49      - BeforePosition - Specify the position as being found before some base. 
  50      - AfterPosition - Specify the position as being found after some base. 
  51      - OneOfPosition - Specify a position where the location can be multiple positions. 
  52      - UnknownPosition - Represents missing information like '?' in UniProt. 
  53  """ 
  54   
  55  from __future__ import print_function 
  56   
  57  from Bio._py3k import _is_int_or_long 
  58   
  59  from Bio.Seq import MutableSeq, reverse_complement 
class SeqFeature(object):
    """Represent a Sequence Feature on an object.

    Attributes:

        - location - the location of the feature on the sequence (FeatureLocation)
        - type - the specified type of the feature (ie. CDS, exon, repeat...)
        - location_operator - a string specifying how this SeqFeature may
          be related to others. For example, for a GenBank feature whose
          location is a join of several parts, the location_operator would
          be "join". This is a proxy for feature.location.operator and only
          applies to compound locations.
        - strand - A value specifying on which strand (of a DNA sequence, for
          instance) the feature deals with. 1 indicates the plus strand, -1
          indicates the minus strand, 0 indicates stranded but unknown (? in GFF3),
          while the default of None indicates that strand doesn't apply (dot in GFF3,
          e.g. features on proteins). Note this is a shortcut for accessing the
          strand property of the feature's location.
        - id - A string identifier for the feature.
        - ref - A reference to another sequence. This could be an accession
          number for some different sequence. Note this is a shortcut for the
          reference property of the feature's location.
        - ref_db - A different database for the reference accession number.
          Note this is a shortcut for the reference property of the location
        - qualifiers - A dictionary of qualifiers on the feature. These are
          analogous to the qualifiers from a GenBank feature table. The keys of
          the dictionary are qualifier names, the values are the qualifier
          values.
        - sub_features - Obsolete list of additional SeqFeatures which was
          used for holding compound locations (e.g. joins in GenBank/EMBL).
          This is now superseded by a CompoundLocation as the location, and
          should not be used (DEPRECATED).
    """

    def __init__(self, location=None, type='', location_operator='',
                 strand=None, id="<unknown id>",
                 qualifiers=None, sub_features=None,
                 ref=None, ref_db=None):
        """Initialize a SeqFeature on a Sequence.

        location can either be a FeatureLocation (with strand argument also
        given if required), or None.

        e.g. With no strand, on the forward strand, and on the reverse strand:

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> f1 = SeqFeature(FeatureLocation(5, 10), type="domain")
        >>> f1.strand == f1.location.strand == None
        True
        >>> f2 = SeqFeature(FeatureLocation(7, 110, strand=1), type="CDS")
        >>> f2.strand == f2.location.strand == +1
        True
        >>> f3 = SeqFeature(FeatureLocation(9, 108, strand=-1), type="CDS")
        >>> f3.strand == f3.location.strand == -1
        True

        An invalid strand will trigger an exception:

        >>> f4 = SeqFeature(FeatureLocation(50, 60), strand=2)
        Traceback (most recent call last):
           ...
        ValueError: Strand should be +1, -1, 0 or None, not 2

        Similarly if set via the FeatureLocation directly:

        >>> loc4 = FeatureLocation(50, 60, strand=2)
        Traceback (most recent call last):
           ...
        ValueError: Strand should be +1, -1, 0 or None, not 2

        For exact start/end positions, an integer can be used (as shown above)
        as shorthand for the ExactPosition object. For non-exact locations, the
        FeatureLocation must be specified via the appropriate position objects.

        Note that the strand, ref and ref_db arguments to the SeqFeature are
        now obsolete and will be deprecated in a future release (which will
        give warning messages) and later removed. Set them via the location
        object instead.

        Note that the location_operator and sub_features arguments are
        deprecated, instead do this via the CompoundLocation object.

        Raises TypeError if location is neither None, a FeatureLocation nor a
        CompoundLocation, and ValueError if strand, ref, ref_db or
        location_operator is given a value while location is None.
        """
        if location is not None and not isinstance(location, FeatureLocation) \
                and not isinstance(location, CompoundLocation):
            raise TypeError(
                "FeatureLocation, CompoundLocation (or None) required for the location")
        self.location = location
        self.type = type
        # The assignments below go through the properties defined further
        # down, which forward the value to self.location.
        if location_operator:
            # TODO - Deprecation warning
            self.location_operator = location_operator
        if strand is not None:
            # TODO - Deprecation warning
            self.strand = strand
        self.id = id
        if qualifiers is None:
            # Fresh dict per instance (never share a mutable default).
            qualifiers = {}
        self.qualifiers = qualifiers
        if sub_features is None:
            sub_features = []
        else:
            import warnings
            from Bio import BiopythonDeprecationWarning
            warnings.warn("Rather than sub_features, use a CompoundFeatureLocation",
                          BiopythonDeprecationWarning)
        # Stored on the private attribute to bypass the warning in the setter.
        self._sub_features = sub_features
        if ref is not None:
            # TODO - Deprecation warning
            self.ref = ref
        if ref_db is not None:
            # TODO - Deprecation warning
            self.ref_db = ref_db

    def _get_sub_features(self):
        """Return the obsolete sub-feature list, warning if non-empty (PRIVATE)."""
        if self._sub_features:
            import warnings
            from Bio import BiopythonDeprecationWarning
            warnings.warn("Rather using f.sub_features, f.location should be a CompoundFeatureLocation",
                          BiopythonDeprecationWarning)
        return self._sub_features

    def _set_sub_features(self, value):
        """Store the obsolete sub-feature list, warning if non-empty (PRIVATE)."""
        if value:
            import warnings
            from Bio import BiopythonDeprecationWarning
            warnings.warn("Rather than f.sub_features, use a CompoundFeatureLocation for f.location",
                          BiopythonDeprecationWarning)
        self._sub_features = value

    sub_features = property(fget=_get_sub_features, fset=_set_sub_features,
                            doc="Obsolete representation of compound locations (DEPRECATED).")

    def _get_strand(self):
        """Return the strand of the feature's location (PRIVATE)."""
        return self.location.strand

    def _set_strand(self, value):
        """Set the strand on the feature's location (PRIVATE).

        Setting None while the location is None is silently accepted; any
        other value without a location raises ValueError.
        """
        try:
            self.location.strand = value
        except AttributeError:
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set strand without a location.")
            else:
                raise

    strand = property(fget=_get_strand, fset=_set_strand,
                      doc="""Feature's strand

                          This is a shortcut for feature.location.strand
                          """)

    def _get_ref(self):
        """Return the location's ref, or None if unavailable (PRIVATE)."""
        try:
            return self.location.ref
        except AttributeError:
            return None

    def _set_ref(self, value):
        """Set the ref on the feature's location (PRIVATE).

        Setting None while the location is None is silently accepted; any
        other value without a location raises ValueError.
        """
        try:
            self.location.ref = value
        except AttributeError:
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set ref without a location.")
            else:
                raise

    ref = property(fget=_get_ref, fset=_set_ref,
                   doc="""Feature location reference (e.g. accession).

                       This is a shortcut for feature.location.ref
                       """)

    def _get_ref_db(self):
        """Return the location's ref_db, or None if unavailable (PRIVATE)."""
        try:
            return self.location.ref_db
        except AttributeError:
            return None

    def _set_ref_db(self, value):
        """Set the ref_db on the feature's location (PRIVATE).

        Behaves like the ref and strand setters: setting None while the
        location is None is silently accepted; any other value without a
        location raises ValueError.
        """
        try:
            self.location.ref_db = value
        except AttributeError:
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set ref_db without a location.")
            else:
                raise

    ref_db = property(fget=_get_ref_db, fset=_set_ref_db,
                      doc="""Feature location reference's database.

                          This is a shortcut for feature.location.ref_db
                          """)

    def _get_location_operator(self):
        """Return the location's operator, or None if it has none (PRIVATE)."""
        try:
            return self.location.operator
        except AttributeError:
            return None

    def _set_location_operator(self, value):
        """Set the operator on a compound location (PRIVATE).

        Falsy values are ignored. A true value raises ValueError unless the
        location is a CompoundLocation.
        """
        if value:
            if isinstance(self.location, CompoundLocation):
                self.location.operator = value
            elif self.location is None:
                raise ValueError(
                    "Location is None so can't set its operator (to %r)" % value)
            else:
                raise ValueError(
                    "Only CompoundLocation gets an operator (%r)" % value)

    location_operator = property(fget=_get_location_operator, fset=_set_location_operator,
                                 doc="Location operator for compound locations (e.g. join).")

    def __repr__(self):
        """A string representation of the record for debugging."""
        answer = "%s(%s" % (self.__class__.__name__, repr(self.location))
        # Only non-default, non-empty fields are included.
        if self.type:
            answer += ", type=%s" % repr(self.type)
        if self.location_operator:
            answer += ", location_operator=%s" % repr(self.location_operator)
        if self.id and self.id != "<unknown id>":
            answer += ", id=%s" % repr(self.id)
        if self.ref:
            answer += ", ref=%s" % repr(self.ref)
        if self.ref_db:
            answer += ", ref_db=%s" % repr(self.ref_db)
        answer += ")"
        return answer

    def __str__(self):
        """A readable summary of the feature intended to be printed to screen.
        """
        out = "type: %s\n" % self.type
        out += "location: %s\n" % self.location
        if self.id and self.id != "<unknown id>":
            out += "id: %s\n" % self.id
        out += "qualifiers:\n"
        # Sorted so the output is deterministic.
        for qual_key in sorted(self.qualifiers):
            out += "    Key: %s, Value: %s\n" % (qual_key,
                                               self.qualifiers[qual_key])
        # TODO - Remove this from __str__ since deprecated
        if len(self._sub_features) != 0:
            out += "Sub-Features\n"
            for sub_feature in self._sub_features:
                out += "%s\n" % sub_feature
        return out

    def _shift(self, offset):
        """Returns a copy of the feature with its location shifted (PRIVATE).

        The annotation qualifiers are copied."""
        answer = SeqFeature(location=self.location._shift(offset),
                            type=self.type,
                            location_operator=self.location_operator,
                            id=self.id,
                            qualifiers=dict(self.qualifiers.items()))
        # This is to avoid the deprecation warning:
        answer._sub_features = [f._shift(offset) for f in self._sub_features]
        return answer

    def _flip(self, length):
        """Returns a copy of the feature with its location flipped (PRIVATE).

        The argument length gives the length of the parent sequence. For
        example a location 0..20 (+1 strand) with parent length 30 becomes
        after flipping 10..30 (-1 strand). Strandless (None) or unknown
        strand (0) remain like that - just their end points are changed.

        The annotation qualifiers are copied.
        """
        answer = SeqFeature(location=self.location._flip(length),
                            type=self.type,
                            location_operator=self.location_operator,
                            id=self.id,
                            qualifiers=dict(self.qualifiers.items()))
        # This is to avoid the deprecation warning:
        # (the sub-features are also taken in reverse order)
        answer._sub_features = [f._flip(length)
                                for f in self._sub_features[::-1]]
        return answer

    def extract(self, parent_sequence):
        """Extract feature sequence from the supplied parent sequence.

        The parent_sequence can be a Seq like object or a string, and will
        generally return an object of the same type. The exception to this is
        a MutableSeq as the parent sequence will return a Seq object.

        This should cope with complex locations including complements, joins
        and fuzzy positions. Even mixed strand features should work! This
        also covers features on protein sequences (e.g. domains), although
        here reverse strand features are not permitted.

        >>> from Bio.Seq import Seq
        >>> from Bio.Alphabet import generic_protein
        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
        >>> f = SeqFeature(FeatureLocation(8, 15), type="domain")
        >>> f.extract(seq)
        Seq('VALIVIC', ProteinAlphabet())

        If the FeatureLocation is None, e.g. when parsing invalid locus
        locations in the GenBank parser, extract() will raise a ValueError.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SeqFeature
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
        >>> f = SeqFeature(None, type="domain")
        >>> f.extract(seq)
        Traceback (most recent call last):
           ...
        ValueError: The feature's .location is None. Check the sequence file for a valid location.

        Note - currently only sub-features of type "join" are supported.
        """
        if self.location is None:
            raise ValueError("The feature's .location is None. Check the "
                             "sequence file for a valid location.")
        return self.location.extract(parent_sequence)

    # Python 3:
    def __bool__(self):
        """Boolean value of an instance of this class (True).

        This behaviour is for backwards compatibility, since until the
        __len__ method was added, a SeqFeature always evaluated as True.

        Note that in comparison, Seq objects, strings, lists, etc, will all
        evaluate to False if they have length zero.

        WARNING: The SeqFeature may in future evaluate to False when its
        length is zero (in order to better match normal python behaviour)!
        """
        return True

    # Python 2:
    __nonzero__ = __bool__

    def __len__(self):
        """Returns the length of the region described by a feature.

        >>> from Bio.Seq import Seq
        >>> from Bio.Alphabet import generic_protein
        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
        >>> f = SeqFeature(FeatureLocation(8, 15), type="domain")
        >>> len(f)
        7
        >>> f.extract(seq)
        Seq('VALIVIC', ProteinAlphabet())
        >>> len(f.extract(seq))
        7

        This is a proxy for taking the length of the feature's location:

        >>> len(f.location)
        7

        For simple features this is the same as the region spanned (end
        position minus start position using Pythonic counting). However, for
        a compound location (e.g. a CDS as the join of several exons) the
        gaps are not counted (e.g. introns). This ensures that len(f) matches
        len(f.extract(parent_seq)), and also makes sure things work properly
        with features wrapping the origin etc.
        """
        return len(self.location)

    def __iter__(self):
        """Iterate over the parent positions within the feature.

        The iteration order is strand aware, and can be thought of as moving
        along the feature using the parent sequence coordinates:

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> f = SeqFeature(FeatureLocation(5, 10), type="domain", strand=-1)
        >>> len(f)
        5
        >>> for i in f: print(i)
        9
        8
        7
        6
        5
        >>> list(f)
        [9, 8, 7, 6, 5]

        This is a proxy for iterating over the location,

        >>> list(f.location)
        [9, 8, 7, 6, 5]
        """
        return iter(self.location)

    def __contains__(self, value):
        """Check if an integer position is within the feature.

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> f = SeqFeature(FeatureLocation(5, 10), type="domain", strand=-1)
        >>> len(f)
        5
        >>> [i for i in range(15) if i in f]
        [5, 6, 7, 8, 9]

        For example, to see which features include a SNP position, you could
        use this:

        >>> from Bio import SeqIO
        >>> record = SeqIO.read("GenBank/NC_000932.gb", "gb")
        >>> for f in record.features:
        ...     if 1750 in f:
        ...         print("%s %s" % (f.type, f.location))
        source [0:154478](+)
        gene [1716:4347](-)
        tRNA join{[4310:4347](-), [1716:1751](-)}

        Note that for a feature defined as a join of several subfeatures (e.g.
        the union of several exons) the gaps are not checked (e.g. introns).
        In this example, the tRNA location is defined in the GenBank file as
        complement(join(1717..1751,4311..4347)), so that position 1760 falls
        in the gap:

        >>> for f in record.features:
        ...     if 1760 in f:
        ...         print("%s %s" % (f.type, f.location))
        source [0:154478](+)
        gene [1716:4347](-)

        Note that additional care may be required with fuzzy locations, for
        example just before a BeforePosition:

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> from Bio.SeqFeature import BeforePosition
        >>> f = SeqFeature(FeatureLocation(BeforePosition(3), 8), type="domain")
        >>> len(f)
        5
        >>> [i for i in range(10) if i in f]
        [3, 4, 5, 6, 7]

        Note that this is a proxy for testing membership on the location.

        >>> [i for i in range(10) if i in f.location]
        [3, 4, 5, 6, 7]
        """
        return value in self.location

# --- References


# TODO -- Will this hold PubMed and Medline information decently?
class Reference(object):
    """Represent a Generic Reference object.

    Attributes:

        - location - A list of Location objects specifying regions of
          the sequence that the references correspond to. If no locations are
          specified, the entire sequence is assumed.
        - authors - A big old string, or a list split by author, of authors
          for the reference.
        - consrtm - Consortium name for the reference (presumably taken from
          the GenBank CONSRTM line - confirm against the parser).
        - title - The title of the reference.
        - journal - Journal the reference was published in.
        - medline_id - A medline reference for the article.
        - pubmed_id - A pubmed reference for the article.
        - comment - A place to stick any comments about the reference.
    """

    def __init__(self):
        """Create an empty reference; all text fields start as ''."""
        self.location = []
        self.authors = ''
        self.consrtm = ''
        self.title = ''
        self.journal = ''
        self.medline_id = ''
        self.pubmed_id = ''
        self.comment = ''

    def __str__(self):
        """Output an informative string for debugging.
        """
        out = ""
        for single_location in self.location:
            out += "location: %s\n" % single_location
        out += "authors: %s\n" % self.authors
        # The consortium line is only shown when set.
        if self.consrtm:
            out += "consrtm: %s\n" % self.consrtm
        out += "title: %s\n" % self.title
        out += "journal: %s\n" % self.journal
        out += "medline id: %s\n" % self.medline_id
        out += "pubmed id: %s\n" % self.pubmed_id
        out += "comment: %s\n" % self.comment
        return out

    def __repr__(self):
        """A short string representation for debugging (title only)."""
        # TODO - Update this if __init__ later accepts values
        return "%s(title=%s, ...)" % (self.__class__.__name__,
                                      repr(self.title))

    def __eq__(self, other):
        """Check if two Reference objects should be considered equal

        Note that the location is not compared, as __eq__ for the
        FeatureLocation class is not defined.

        Comparison with an object which is not a Reference returns
        NotImplemented (so ``ref == 5`` is simply False rather than an
        AttributeError).
        """
        if not isinstance(other, Reference):
            return NotImplemented
        return self.authors == other.authors and \
            self.consrtm == other.consrtm and \
            self.title == other.title and \
            self.journal == other.journal and \
            self.medline_id == other.medline_id and \
            self.pubmed_id == other.pubmed_id and \
            self.comment == other.comment

    def __ne__(self, other):
        """Inverse of __eq__ (needed explicitly on Python 2)."""
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

562 563 # --- Handling feature locations 564 565 -class FeatureLocation(object):
566 """Specify the location of a feature along a sequence. 567 568 The FeatureLocation is used for simple continuous features, which can 569 be described as running from a start position to and end position 570 (optionally with a strand and reference information). More complex 571 locations made up from several non-continuous parts (e.g. a coding 572 sequence made up of several exons) are currently described using a 573 SeqFeature with sub-features. 574 575 Note that the start and end location numbering follow Python's scheme, 576 thus a GenBank entry of 123..150 (one based counting) becomes a location 577 of [122:150] (zero based counting). 578 579 >>> from Bio.SeqFeature import FeatureLocation 580 >>> f = FeatureLocation(122, 150) 581 >>> print(f) 582 [122:150] 583 >>> print(f.start) 584 122 585 >>> print(f.end) 586 150 587 >>> print(f.strand) 588 None 589 590 Note the strand defaults to None. If you are working with nucleotide 591 sequences you'd want to be explicit if it is the forward strand: 592 593 >>> from Bio.SeqFeature import FeatureLocation 594 >>> f = FeatureLocation(122, 150, strand=+1) 595 >>> print(f) 596 [122:150](+) 597 >>> print(f.strand) 598 1 599 600 Note that for a parent sequence of length n, the FeatureLocation 601 start and end must satisfy the inequality 0 <= start <= end <= n. 602 This means even for features on the reverse strand of a nucleotide 603 sequence, we expect the 'start' coordinate to be less than the 604 'end'. 605 606 >>> from Bio.SeqFeature import FeatureLocation 607 >>> r = FeatureLocation(122, 150, strand=-1) 608 >>> print(r) 609 [122:150](-) 610 >>> print(r.start) 611 122 612 >>> print(r.end) 613 150 614 >>> print(r.strand) 615 -1 616 617 i.e. Rather than thinking of the 'start' and 'end' biologically in a 618 strand aware manor, think of them as the 'left most' or 'minimum' 619 boundary, and the 'right most' or 'maximum' boundary of the region 620 being described. 
This is particularly important with compound 621 locations describing non-continuous regions. 622 623 In the example above we have used standard exact positions, but there 624 are also specialised position objects used to represent fuzzy positions 625 as well, for example a GenBank location like complement(<123..150) 626 would use a BeforePosition object for the start. 627 """ 628
629 - def __init__(self, start, end, strand=None, ref=None, ref_db=None):
630 """Specify the start, end, strand etc of a sequence feature. 631 632 start and end arguments specify the values where the feature begins 633 and ends. These can either by any of the ``*Position`` objects that 634 inherit from AbstractPosition, or can just be integers specifying the 635 position. In the case of integers, the values are assumed to be 636 exact and are converted in ExactPosition arguments. This is meant 637 to make it easy to deal with non-fuzzy ends. 638 639 i.e. Short form: 640 641 >>> from Bio.SeqFeature import FeatureLocation 642 >>> loc = FeatureLocation(5, 10, strand=-1) 643 >>> print(loc) 644 [5:10](-) 645 646 Explicit form: 647 648 >>> from Bio.SeqFeature import FeatureLocation, ExactPosition 649 >>> loc = FeatureLocation(ExactPosition(5), ExactPosition(10), strand=-1) 650 >>> print(loc) 651 [5:10](-) 652 653 Other fuzzy positions are used similarly, 654 655 >>> from Bio.SeqFeature import FeatureLocation 656 >>> from Bio.SeqFeature import BeforePosition, AfterPosition 657 >>> loc2 = FeatureLocation(BeforePosition(5), AfterPosition(10), strand=-1) 658 >>> print(loc2) 659 [<5:>10](-) 660 661 For nucleotide features you will also want to specify the strand, 662 use 1 for the forward (plus) strand, -1 for the reverse (negative) 663 strand, 0 for stranded but strand unknown (? in GFF3), or None for 664 when the strand does not apply (dot in GFF3), e.g. features on 665 proteins. 
666 667 >>> loc = FeatureLocation(5, 10, strand=+1) 668 >>> print(loc) 669 [5:10](+) 670 >>> print(loc.strand) 671 1 672 673 Normally feature locations are given relative to the parent 674 sequence you are working with, but an explicit accession can 675 be given with the optional ref and db_ref strings: 676 677 >>> loc = FeatureLocation(105172, 108462, ref="AL391218.9", strand=1) 678 >>> print(loc) 679 AL391218.9[105172:108462](+) 680 >>> print(loc.ref) 681 AL391218.9 682 683 """ 684 # TODO - Check 0 <= start <= end (<= length of reference) 685 if isinstance(start, AbstractPosition): 686 self._start = start 687 elif _is_int_or_long(start): 688 self._start = ExactPosition(start) 689 else: 690 raise TypeError("start=%r %s" % (start, type(start))) 691 if isinstance(end, AbstractPosition): 692 self._end = end 693 elif _is_int_or_long(end): 694 self._end = ExactPosition(end) 695 else: 696 raise TypeError("end=%r %s" % (end, type(end))) 697 self.strand = strand 698 self.ref = ref 699 self.ref_db = ref_db
700
701 - def _get_strand(self):
702 return self._strand
703
704 - def _set_strand(self, value):
705 if value not in [+1, -1, 0, None]: 706 raise ValueError("Strand should be +1, -1, 0 or None, not %r" 707 % value) 708 self._strand = value
709 710 strand = property(fget=_get_strand, fset=_set_strand, 711 doc="Strand of the location (+1, -1, 0 or None).") 712
713 - def __str__(self):
714 """Returns a representation of the location (with python counting). 715 716 For the simple case this uses the python splicing syntax, [122:150] 717 (zero based counting) which GenBank would call 123..150 (one based 718 counting). 719 """ 720 answer = "[%s:%s]" % (self._start, self._end) 721 if self.ref and self.ref_db: 722 answer = "%s:%s%s" % (self.ref_db, self.ref, answer) 723 elif self.ref: 724 answer = self.ref + answer 725 # Is ref_db without ref meaningful? 726 if self.strand is None: 727 return answer 728 elif self.strand == +1: 729 return answer + "(+)" 730 elif self.strand == -1: 731 return answer + "(-)" 732 else: 733 # strand = 0, stranded but strand unknown, ? in GFF3 734 return answer + "(?)"
735
736 - def __repr__(self):
737 """A string representation of the location for debugging.""" 738 optional = "" 739 if self.strand is not None: 740 optional += ", strand=%r" % self.strand 741 if self.ref is not None: 742 optional += ", ref=%r" % self.ref 743 if self.ref_db is not None: 744 optional += ", ref_db=%r" % self.ref_db 745 return "%s(%r, %r%s)" \ 746 % (self.__class__.__name__, self.start, self.end, optional)
747
748 - def __add__(self, other):
749 """Combine location with another feature location, or shift it. 750 751 You can add two feature locations to make a join CompoundLocation: 752 753 >>> from Bio.SeqFeature import FeatureLocation 754 >>> f1 = FeatureLocation(5, 10) 755 >>> f2 = FeatureLocation(20, 30) 756 >>> combined = f1 + f2 757 >>> print(combined) 758 join{[5:10], [20:30]} 759 760 This is thus equivalent to: 761 762 >>> from Bio.SeqFeature import CompoundLocation 763 >>> join = CompoundLocation([f1, f2]) 764 >>> print(join) 765 join{[5:10], [20:30]} 766 767 You can also use sum(...) in this way: 768 769 >>> join = sum([f1, f2]) 770 >>> print(join) 771 join{[5:10], [20:30]} 772 773 Furthermore, you can combine a FeatureLocation with a CompoundLocation 774 in this way. 775 776 Separately, adding an integer will give a new FeatureLocation with 777 its start and end offset by that amount. For example: 778 779 >>> print(f1) 780 [5:10] 781 >>> print(f1 + 100) 782 [105:110] 783 >>> print(200 + f1) 784 [205:210] 785 786 This can be useful when editing annotation. 787 """ 788 if isinstance(other, FeatureLocation): 789 return CompoundLocation([self, other]) 790 elif isinstance(other, int): 791 return self._shift(other) 792 else: 793 # This will allow CompoundLocation's __radd__ to be called: 794 return NotImplemented
795
796 - def __radd__(self, other):
797 if isinstance(other, int): 798 return self._shift(other) 799 else: 800 return NotImplemented
801
802 - def __nonzero__(self):
803 """Returns True regardless of the length of the feature. 804 805 This behaviour is for backwards compatibility, since until the 806 __len__ method was added, a FeatureLocation always evaluated as True. 807 808 Note that in comparison, Seq objects, strings, lists, etc, will all 809 evaluate to False if they have length zero. 810 811 WARNING: The FeatureLocation may in future evaluate to False when its 812 length is zero (in order to better match normal python behaviour)! 813 """ 814 return True
815
816 - def __len__(self):
817 """Returns the length of the region described by the FeatureLocation. 818 819 Note that extra care may be needed for fuzzy locations, e.g. 820 821 >>> from Bio.SeqFeature import FeatureLocation 822 >>> from Bio.SeqFeature import BeforePosition, AfterPosition 823 >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10)) 824 >>> len(loc) 825 5 826 """ 827 return int(self._end) - int(self._start)
828
829 - def __contains__(self, value):
830 """Check if an integer position is within the FeatureLocation. 831 832 Note that extra care may be needed for fuzzy locations, e.g. 833 834 >>> from Bio.SeqFeature import FeatureLocation 835 >>> from Bio.SeqFeature import BeforePosition, AfterPosition 836 >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10)) 837 >>> len(loc) 838 5 839 >>> [i for i in range(15) if i in loc] 840 [5, 6, 7, 8, 9] 841 """ 842 if not isinstance(value, int): 843 raise ValueError("Currently we only support checking for integer " 844 "positions being within a FeatureLocation.") 845 if value < self._start or value >= self._end: 846 return False 847 else: 848 return True
849
850 - def __iter__(self):
851 """Iterate over the parent positions within the FeatureLocation. 852 853 >>> from Bio.SeqFeature import FeatureLocation 854 >>> from Bio.SeqFeature import BeforePosition, AfterPosition 855 >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10)) 856 >>> len(loc) 857 5 858 >>> for i in loc: print(i) 859 5 860 6 861 7 862 8 863 9 864 >>> list(loc) 865 [5, 6, 7, 8, 9] 866 >>> [i for i in range(15) if i in loc] 867 [5, 6, 7, 8, 9] 868 869 Note this is strand aware: 870 871 >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10), strand = -1) 872 >>> list(loc) 873 [9, 8, 7, 6, 5] 874 """ 875 if self.strand == -1: 876 for i in range(self._end - 1, self._start - 1, -1): 877 yield i 878 else: 879 for i in range(self._start, self._end): 880 yield i
881
882 - def _shift(self, offset):
883 """Returns a copy of the location shifted by the offset (PRIVATE).""" 884 # TODO - What if offset is a fuzzy position? 885 if self.ref or self.ref_db: 886 # TODO - Return self? 887 raise ValueError("Feature references another sequence.") 888 return FeatureLocation(start=self._start._shift(offset), 889 end=self._end._shift(offset), 890 strand=self.strand)
891
892 - def _flip(self, length):
893 """Returns a copy of the location after the parent is reversed (PRIVATE).""" 894 if self.ref or self.ref_db: 895 # TODO - Return self? 896 raise ValueError("Feature references another sequence.") 897 # Note this will flip the start and end too! 898 if self.strand == +1: 899 flip_strand = -1 900 elif self.strand == -1: 901 flip_strand = +1 902 else: 903 # 0 or None 904 flip_strand = self.strand 905 return FeatureLocation(start=self._end._flip(length), 906 end=self._start._flip(length), 907 strand=flip_strand)
908 909 @property
910 - def parts(self):
911 """Read only list of parts (always one, the Feature Location). 912 913 This is a convenience property allowing you to write code handling 914 both simple FeatureLocation objects (with one part) and more complex 915 CompoundLocation objects (with multiple parts) interchangeably. 916 """ 917 return [self]
@property
def start(self):
    """Start location (integer like, possibly a fuzzy position, read only).

    This hands back the stored ``_start`` object unchanged.
    """
    start_position = self._start
    return start_position
@property
def end(self):
    """End location (integer like, possibly a fuzzy position, read only).

    This hands back the stored ``_end`` object unchanged.
    """
    end_position = self._end
    return end_position
@property
def nofuzzy_start(self):
    """Start position (integer, approximated if fuzzy, read only) (OBSOLETE).

    This is now an alias for int(feature.start), which should be
    used in preference -- unless you are trying to support old
    versions of Biopython.

    If the conversion raises TypeError and the start is an
    UnknownPosition, None is returned; any other TypeError propagates.
    """
    try:
        as_integer = int(self._start)
    except TypeError:
        if not isinstance(self._start, UnknownPosition):
            raise
        as_integer = None
    return as_integer
@property
def nofuzzy_end(self):
    """End position (integer, approximated if fuzzy, read only) (OBSOLETE).

    This is now an alias for int(feature.end), which should be
    used in preference -- unless you are trying to support old
    versions of Biopython.

    If the conversion raises TypeError and the end is an
    UnknownPosition, None is returned; any other TypeError propagates.
    """
    try:
        as_integer = int(self._end)
    except TypeError:
        if not isinstance(self._end, UnknownPosition):
            raise
        as_integer = None
    return as_integer
958
def extract(self, parent_sequence):
    """Extract feature sequence from the supplied parent sequence.

    The parent is sliced from ``nofuzzy_start`` to ``nofuzzy_end``, and for
    a strand of -1 the slice is reverse complemented before being returned.

    Raises ValueError if the location has a ``ref`` or ``ref_db`` set.
    """
    if self.ref or self.ref_db:
        # TODO - Take a dictionary as an optional argument?
        raise ValueError("Feature references another sequence.")
    if isinstance(parent_sequence, MutableSeq):
        # This avoids complications with reverse complements
        # (the MutableSeq reverse complement acts in situ)
        parent_sequence = parent_sequence.toseq()
    sub_seq = parent_sequence[self.nofuzzy_start:self.nofuzzy_end]
    if self.strand != -1:
        return sub_seq
    try:
        return sub_seq.reverse_complement()
    except AttributeError:
        # No reverse_complement method; only plain strings are expected here.
        assert isinstance(sub_seq, str)
        return reverse_complement(sub_seq)
976
class CompoundLocation(object):
    """For handling joins etc where a feature location has several parts."""

    def __init__(self, parts, operator="join"):
        """Create a compound location with several parts.

        Arguments:
         - parts - Any iterable of FeatureLocation objects (a list, tuple or
           generator); at least two are required.
         - operator - String naming how the parts combine, default "join".

        Raises ValueError if any part is not a FeatureLocation, or if fewer
        than two parts are given.

        >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation
        >>> f1 = FeatureLocation(10, 40, strand=+1)
        >>> f2 = FeatureLocation(50, 59, strand=+1)
        >>> f = CompoundLocation([f1, f2])
        >>> len(f) == len(f1) + len(f2) == 39 == len(list(f))
        True
        >>> print(f.operator)
        join
        >>> 5 in f
        False
        >>> 15 in f
        True
        >>> f.strand
        1

        Notice that the strand of the compound location is computed
        automatically - in the case of mixed strands on the sub-locations
        the overall strand is set to None.

        >>> f = CompoundLocation([FeatureLocation(3, 6, strand=+1),
        ...                       FeatureLocation(10, 13, strand=-1)])
        >>> print(f.strand)
        None
        >>> len(f)
        6
        >>> list(f)
        [3, 4, 5, 12, 11, 10]

        The example above doing list(f) iterates over the coordinates within the
        feature. This allows you to use max and min on the location, to find the
        range covered:

        >>> min(f)
        3
        >>> max(f)
        12

        More generally, you can use the compound location's start and end which
        give the full range covered, 0 <= start <= end <= full sequence length.

        >>> f.start == min(f)
        True
        >>> f.end == max(f) + 1
        True

        This is consistent with the behaviour of the simple FeatureLocation for
        a single region, where again the 'start' and 'end' do not necessarily
        give the biological start and end, but rather the 'minimal' and 'maximal'
        coordinate boundaries.

        Note that adding locations provides a more intuitive method of
        construction:

        >>> f = FeatureLocation(3, 6, strand=+1) + FeatureLocation(10, 13, strand=-1)
        >>> len(f)
        6
        >>> list(f)
        [3, 4, 5, 12, 11, 10]
        """
        self.operator = operator
        self.parts = list(parts)
        for loc in self.parts:
            if not isinstance(loc, FeatureLocation):
                raise ValueError("CompoundLocation should be given a list of "
                                 "FeatureLocation objects, not %s" % loc.__class__)
        # Count the materialised list, not the raw argument, so iterators and
        # generators (which have no len) are handled too.
        if len(self.parts) < 2:
            # Wrap in a 1-tuple so the parts are always formatted as a single
            # value, whatever container type the caller supplied.
            raise ValueError(
                "CompoundLocation should have at least 2 parts, not %r"
                % (self.parts,))

    def __str__(self):
        """Returns a representation of the location (with python counting)."""
        return "%s{%s}" % (self.operator, ", ".join(str(loc) for loc in self.parts))

    def __repr__(self):
        """String representation of the location for debugging."""
        return "%s(%r, %r)" % (self.__class__.__name__,
                               self.parts, self.operator)

    def _get_strand(self):
        # Historically a join on the reverse strand has been represented
        # in Biopython with both the parent SeqFeature and its children
        # (the exons for a CDS) all given a strand of -1.  Likewise, for
        # a join feature on the forward strand they all have strand +1.
        # However, we must also consider evil mixed strand examples like
        # this, join(complement(69611..69724),139856..140087,140625..140650)
        if len(set(loc.strand for loc in self.parts)) == 1:
            return self.parts[0].strand
        else:
            return None  # i.e. mixed strands

    def _set_strand(self, value):
        # Should this be allowed/encouraged?
        for loc in self.parts:
            loc.strand = value

    strand = property(fget=_get_strand, fset=_set_strand,
                      doc="""Overall strand of the compound location.

        If all the parts have the same strand, that is returned. Otherwise
        for mixed strands, this returns None.

        >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation
        >>> f1 = FeatureLocation(15, 17, strand=1)
        >>> f2 = FeatureLocation(20, 30, strand=-1)
        >>> f = f1 + f2
        >>> f1.strand
        1
        >>> f2.strand
        -1
        >>> f.strand
        >>> f.strand is None
        True

        If you set the strand of a CompoundLocation, this is applied to
        all the parts - use with caution:

        >>> f.strand = 1
        >>> f1.strand
        1
        >>> f2.strand
        1
        >>> f.strand
        1

        """)

    def __add__(self, other):
        """Combine locations, or shift the location by an integer offset.

        >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation
        >>> f1 = FeatureLocation(15, 17) + FeatureLocation(20, 30)
        >>> print(f1)
        join{[15:17], [20:30]}

        You can add another FeatureLocation:

        >>> print(f1 + FeatureLocation(40, 50))
        join{[15:17], [20:30], [40:50]}
        >>> print(FeatureLocation(5, 10) + f1)
        join{[5:10], [15:17], [20:30]}

        You can also add another CompoundLocation:

        >>> f2 = FeatureLocation(40, 50) + FeatureLocation(60, 70)
        >>> print(f2)
        join{[40:50], [60:70]}
        >>> print(f1 + f2)
        join{[15:17], [20:30], [40:50], [60:70]}

        Also, as with the FeatureLocation, adding an integer shifts the
        location's co-ordinates by that offset:

        >>> print(f1 + 100)
        join{[115:117], [120:130]}
        >>> print(200 + f1)
        join{[215:217], [220:230]}
        >>> print(f1 + (-5))
        join{[10:12], [15:25]}

        Raises ValueError when adding a CompoundLocation with a different
        operator, and NotImplementedError for any other type of operand.
        """
        if isinstance(other, FeatureLocation):
            return CompoundLocation(self.parts + [other], self.operator)
        elif isinstance(other, CompoundLocation):
            if self.operator != other.operator:
                # Handle join+order -> order as a special case?
                raise ValueError("Mixed operators %s and %s"
                                 % (self.operator, other.operator))
            return CompoundLocation(self.parts + other.parts, self.operator)
        elif isinstance(other, int):
            return self._shift(other)
        else:
            raise NotImplementedError

    def __radd__(self, other):
        """Combine locations (reflected form, i.e. ``other + self``)."""
        if isinstance(other, FeatureLocation):
            return CompoundLocation([other] + self.parts, self.operator)
        elif isinstance(other, int):
            return self._shift(other)
        else:
            raise NotImplementedError

    def __contains__(self, value):
        """Check if an integer position is within the location."""
        return any(value in loc for loc in self.parts)

    def __nonzero__(self):
        """Returns True regardless of the length of the feature.

        This behaviour is for backwards compatibility, since until the
        __len__ method was added, a FeatureLocation always evaluated as True.

        Note that in comparison, Seq objects, strings, lists, etc, will all
        evaluate to False if they have length zero.

        WARNING: The FeatureLocation may in future evaluate to False when its
        length is zero (in order to better match normal python behaviour)!
        """
        return True

    # Python 3 looks up __bool__ rather than __nonzero__; without this alias
    # truth testing would fall back to __len__ and a zero length location
    # would wrongly evaluate as False.
    __bool__ = __nonzero__

    def __len__(self):
        """Total length, i.e. the sum of the lengths of all the parts."""
        return sum(len(loc) for loc in self.parts)

    def __iter__(self):
        """Iterate over the positions of each part, taking the parts in order."""
        for loc in self.parts:
            for pos in loc:
                yield pos

    def _shift(self, offset):
        """Returns a copy of the location shifted by the offset (PRIVATE)."""
        return CompoundLocation([loc._shift(offset) for loc in self.parts],
                                self.operator)

    def _flip(self, length):
        """Returns a copy of the location after the parent is reversed (PRIVATE).

        Note that the order of the parts is NOT reversed too. Consider a CDS
        on the forward strand with exons small, medium and large (in length).
        Once we change the frame of reference to the reverse complement strand,
        the start codon is still part of the small exon, and the stop codon
        still part of the large exon - so the part order remains the same!

        Here is an artificial example, where the features map to the two upper
        case regions and the lower case runs of n are not used:

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import FeatureLocation
        >>> dna = Seq("nnnnnAGCATCCTGCTGTACnnnnnnnnGAGAMTGCCATGCCCCTGGAGTGAnnnnn")
        >>> small = FeatureLocation(5, 20, strand=1)
        >>> large = FeatureLocation(28, 52, strand=1)
        >>> location = small + large
        >>> print(small)
        [5:20](+)
        >>> print(large)
        [28:52](+)
        >>> print(location)
        join{[5:20](+), [28:52](+)}
        >>> for part in location.parts:
        ...     print(len(part))
        ...
        15
        24

        As you can see, this is a silly example where each "exon" is a word:

        >>> print(small.extract(dna).translate())
        SILLY
        >>> print(large.extract(dna).translate())
        EXAMPLE*
        >>> print(location.extract(dna).translate())
        SILLYEXAMPLE*
        >>> for part in location.parts:
        ...     print(part.extract(dna).translate())
        ...
        SILLY
        EXAMPLE*

        Now, let's look at this from the reverse strand frame of reference:

        >>> flipped_dna = dna.reverse_complement()
        >>> flipped_location = location._flip(len(dna))
        >>> print(flipped_location.extract(flipped_dna).translate())
        SILLYEXAMPLE*
        >>> for part in flipped_location.parts:
        ...     print(part.extract(flipped_dna).translate())
        ...
        SILLY
        EXAMPLE*

        The key point here is the first part of the CompoundFeature is still the
        small exon, while the second part is still the large exon:

        >>> for part in flipped_location.parts:
        ...     print(len(part))
        ...
        15
        24
        >>> print(flipped_location)
        join{[37:52](-), [5:29](-)}

        Notice the parts are not reversed. However, there was a bug here in older
        versions of Biopython which would have given join{[5:29](-), [37:52](-)}
        and the translation would have wrongly been "EXAMPLE*SILLY" instead.

        """
        return CompoundLocation([loc._flip(length) for loc in self.parts],
                                self.operator)

    @property
    def start(self):
        """Start location (integer like, possibly a fuzzy position, read only)."""
        return min(loc.start for loc in self.parts)

    @property
    def end(self):
        """End location (integer like, possibly a fuzzy position, read only)."""
        return max(loc.end for loc in self.parts)

    @property
    def nofuzzy_start(self):
        """Start position (integer, approximated if fuzzy, read only) (OBSOLETE).

        This is an alias for int(feature.start), which should be used in
        preference -- unless you are trying to support old versions of
        Biopython.
        """
        try:
            return int(self.start)
        except TypeError:
            if isinstance(self.start, UnknownPosition):
                return None
            raise

    @property
    def nofuzzy_end(self):
        """End position (integer, approximated if fuzzy, read only) (OBSOLETE).

        This is an alias for int(feature.end), which should be used in
        preference -- unless you are trying to support old versions of
        Biopython.
        """
        try:
            return int(self.end)
        except TypeError:
            if isinstance(self.end, UnknownPosition):
                return None
            raise

    @property
    def ref(self):
        """CompoundLocation's don't have a ref (dummy method for API compatibility)."""
        return None

    @property
    def ref_db(self):
        """CompoundLocation's don't have a ref_db (dummy method for API compatibility)."""
        return None

    def extract(self, parent_sequence):
        """Extract feature sequence from the supplied parent sequence.

        Each part is extracted on its own (using that part's strand) and the
        pieces are concatenated in the order of the parts.
        """
        # This copes with mixed strand features & all on reverse:
        parts = [loc.extract(parent_sequence) for loc in self.parts]
        # We use addition rather than a join to avoid alphabet issues:
        f_seq = parts[0]
        for part in parts[1:]:
            f_seq += part
        return f_seq
1332
class AbstractPosition(object):
    """Abstract base class representing a position."""

    def __repr__(self):
        """Debugging representation: the class name followed by ``(...)``."""
        class_name = self.__class__.__name__
        return "%s(...)" % (class_name,)
1340
class ExactPosition(int, AbstractPosition):
    """Specify the specific position of a boundary.

    o position - The position of the boundary.
    o extension - An optional argument which must be zero since we don't
    have an extension. The argument is provided so that the same number of
    arguments can be passed to all position types.

    In this case, there is no fuzziness associated with the position.

    >>> p = ExactPosition(5)
    >>> p
    ExactPosition(5)
    >>> print(p)
    5

    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    Integer comparisons and operations should work as expected:

    >>> p == 5
    True
    >>> p < 6
    True
    >>> p <= 5
    True
    >>> p + 10
    15

    """

    def __new__(cls, position, extension=0):
        """Build the integer value, rejecting any non-zero extension."""
        if extension != 0:
            message = "Non-zero extension %s for exact position." % extension
            raise AttributeError(message)
        return int.__new__(cls, position)

    def __repr__(self):
        """String representation of the ExactPosition location for debugging."""
        class_name = self.__class__.__name__
        return "%s(%i)" % (class_name, int(self))

    @property
    def position(self):
        """Legacy attribute to get position as integer (OBSOLETE)."""
        return int(self)

    @property
    def extension(self):
        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
        return 0

    def _shift(self, offset):
        # Go via self.__class__ so that any subclass is preserved
        same_class = self.__class__
        return same_class(int(self) + offset)

    def _flip(self, length):
        # Go via self.__class__ so that any subclass is preserved
        same_class = self.__class__
        return same_class(length - int(self))
1402
class UncertainPosition(ExactPosition):
    """Specify a specific position which is uncertain.

    This is used in UniProt, e.g. ?222 for uncertain position 222, or in the
    XML format explicitly marked as uncertain. Does not apply to GenBank/EMBL.

    Nothing is added or overridden here; all behaviour is inherited from
    ExactPosition.
    """
1411
class UnknownPosition(AbstractPosition):
    """Specify a specific position which is unknown (has no position).

    This is used in UniProt, e.g. ? or in the XML as unknown.
    """

    def __repr__(self):
        """String representation of the UnknownPosition location for debugging."""
        class_name = self.__class__.__name__
        return "%s()" % class_name

    def __hash__(self):
        """Hash identically to None."""
        return hash(None)

    @property
    def position(self):
        """Legacy attribute to get position (None) (OBSOLETE)."""
        return None

    @property
    def extension(self):
        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
        return 0

    def _shift(self, offset):
        # Shifting has no effect; the very same object is returned.
        return self

    def _flip(self, length):
        # Flipping has no effect; the very same object is returned.
        return self
1441
class WithinPosition(int, AbstractPosition):
    """Specify the position of a boundary within some coordinates.

    Arguments:
    o position - The default integer position
    o left - The start (left) position of the boundary
    o right - The end (right) position of the boundary

    This allows dealing with a position like ((1.4)..100). This
    indicates that the start of the sequence is somewhere between 1
    and 4. Since this is a start coordinate, it should act like
    it is at position 1 (or in Python counting, 0).

    >>> p = WithinPosition(10, 10, 13)
    >>> p
    WithinPosition(10, left=10, right=13)
    >>> print(p)
    (10.13)
    >>> int(p)
    10

    Basic integer comparisons and operations should work as though
    this were a plain integer:

    >>> p == 10
    True
    >>> p in [9, 10, 11]
    True
    >>> p < 11
    True
    >>> p + 10
    20

    >>> isinstance(p, WithinPosition)
    True
    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    Note this also applies for comparison to other position objects,
    where again the integer behaviour is used:

    >>> p == 10
    True
    >>> p == ExactPosition(10)
    True
    >>> p == BeforePosition(10)
    True
    >>> p == AfterPosition(10)
    True

    If this were an end point, you would want the position to be 13:

    >>> p2 = WithinPosition(13, 10, 13)
    >>> p2
    WithinPosition(13, left=10, right=13)
    >>> print(p2)
    (10.13)
    >>> int(p2)
    13
    >>> p2 == 13
    True
    >>> p2 == ExactPosition(13)
    True

    The old legacy properties of position and extension give the
    starting/lower/left position as an integer, and the distance
    to the ending/higher/right position as an integer. Note that
    the position object will act like either the left or the right
    end-point depending on how it was created:

    >>> p.position == p2.position == 10
    True
    >>> p.extension == p2.extension == 3
    True
    >>> int(p) == int(p2)
    False
    >>> p == 10
    True
    >>> p2 == 13
    True

    """

    def __new__(cls, position, left, right):
        """Create the position, checking it equals one of its boundaries.

        Raises AssertionError if position matches neither left nor right.
        """
        if not (position == left or position == right):
            # Raised explicitly rather than with an assert statement, so the
            # validation is not stripped when Python runs with -O.
            raise AssertionError(
                "WithinPosition: %r should match left %r or right %r"
                % (position, left, right))
        obj = int.__new__(cls, position)
        obj._left = left
        obj._right = right
        return obj

    def __repr__(self):
        """String representation of the WithinPosition location for debugging."""
        return "%s(%i, left=%i, right=%i)" \
               % (self.__class__.__name__, int(self),
                  self._left, self._right)

    def __str__(self):
        """Return the boundaries in the form ``(left.right)``."""
        return "(%s.%s)" % (self._left, self._right)

    @property
    def position(self):
        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
        return self._left

    @property
    def extension(self):
        """Legacy attribute to get extension (from left to right) as an integer (OBSOLETE)."""
        return self._right - self._left

    def _shift(self, offset):
        """Return a copy with the value and both boundaries shifted (PRIVATE)."""
        return self.__class__(int(self) + offset,
                              self._left + offset,
                              self._right + offset)

    def _flip(self, length):
        """Return a copy for the reversed parent of given length (PRIVATE)."""
        # The boundaries swap roles: the old right becomes the new left.
        return self.__class__(length - int(self),
                              length - self._right,
                              length - self._left)
1564
class BetweenPosition(int, AbstractPosition):
    """Specify the position of a boundary between two coordinates (OBSOLETE?).

    Arguments:
    o position - The default integer position
    o left - The start (left) position of the boundary
    o right - The end (right) position of the boundary

    This allows dealing with a position like 123^456. This
    indicates that the start of the sequence is somewhere between
    123 and 456. It is up to the parser to set the position argument
    to either boundary point (depending on if this is being used as
    a start or end of the feature). For example as a feature end:

    >>> p = BetweenPosition(456, 123, 456)
    >>> p
    BetweenPosition(456, left=123, right=456)
    >>> print(p)
    (123^456)
    >>> int(p)
    456

    Integer equality and comparison use the given position,

    >>> p == 456
    True
    >>> p in [455, 456, 457]
    True
    >>> p > 300
    True

    The old legacy properties of position and extension give the
    starting/lower/left position as an integer, and the distance
    to the ending/higher/right position as an integer. Note that
    the position object will act like either the left or the right
    end-point depending on how it was created:

    >>> p2 = BetweenPosition(123, left=123, right=456)
    >>> p.position == p2.position == 123
    True
    >>> p.extension
    333
    >>> p2.extension
    333
    >>> p.extension == p2.extension == 333
    True
    >>> int(p) == int(p2)
    False
    >>> p == 456
    True
    >>> p2 == 123
    True

    Note this potentially surprising behaviour:

    >>> BetweenPosition(123, left=123, right=456) == ExactPosition(123)
    True
    >>> BetweenPosition(123, left=123, right=456) == BeforePosition(123)
    True
    >>> BetweenPosition(123, left=123, right=456) == AfterPosition(123)
    True

    i.e. For equality (and sorting) the position objects behave like
    integers.
    """

    def __new__(cls, position, left, right):
        """Create the position, checking it equals one of its boundaries.

        Raises AssertionError if position matches neither left nor right.
        """
        if not (position == left or position == right):
            # Raised explicitly rather than with an assert statement, so the
            # validation is not stripped when Python runs with -O.
            raise AssertionError(
                "BetweenPosition: %r should match left %r or right %r"
                % (position, left, right))
        obj = int.__new__(cls, position)
        obj._left = left
        obj._right = right
        return obj

    def __repr__(self):
        """String representation of the BetweenPosition location for debugging."""
        return "%s(%i, left=%i, right=%i)" \
               % (self.__class__.__name__, int(self),
                  self._left, self._right)

    def __str__(self):
        """Return the boundaries in the form ``(left^right)``."""
        return "(%s^%s)" % (self._left, self._right)

    @property
    def position(self):
        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
        return self._left

    @property
    def extension(self):
        """Legacy attribute to get extension (from left to right) as an integer (OBSOLETE)."""
        return self._right - self._left

    def _shift(self, offset):
        """Return a copy with the value and both boundaries shifted (PRIVATE)."""
        return self.__class__(int(self) + offset,
                              self._left + offset,
                              self._right + offset)

    def _flip(self, length):
        """Return a copy for the reversed parent of given length (PRIVATE)."""
        # The boundaries swap roles: the old right becomes the new left.
        return self.__class__(length - int(self),
                              length - self._right,
                              length - self._left)
1666
class BeforePosition(int, AbstractPosition):
    """Specify a position where the actual location occurs before it.

    Arguments:
    o position - The upper boundary of where the location can occur.
    o extension - An optional argument which must be zero since we don't
    have an extension. The argument is provided so that the same number of
    arguments can be passed to all position types.

    This is used to specify positions like (<10..100) where the location
    occurs somewhere before position 10.

    >>> p = BeforePosition(5)
    >>> p
    BeforePosition(5)
    >>> print(p)
    <5
    >>> int(p)
    5
    >>> p + 10
    15

    Note this potentially surprising behaviour:

    >>> p == ExactPosition(5)
    True
    >>> p == AfterPosition(5)
    True

    Just remember that for equality and sorting the position objects act
    like integers.
    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, extension=0):
        """Build the integer value, rejecting any non-zero extension."""
        if extension != 0:
            message = "Non-zero extension %s for exact position." % extension
            raise AttributeError(message)
        return int.__new__(cls, position)

    @property
    def position(self):
        """Legacy attribute to get position as integer (OBSOLETE)."""
        return int(self)

    @property
    def extension(self):
        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
        return 0

    def __repr__(self):
        """A string representation of the location for debugging."""
        class_name = self.__class__.__name__
        return "%s(%i)" % (class_name, int(self))

    def __str__(self):
        """Return the position with a ``<`` prefix."""
        return "<%s" % self.position

    def _shift(self, offset):
        # Go via self.__class__ so that any subclass is preserved
        same_class = self.__class__
        return same_class(int(self) + offset)

    def _flip(self, length):
        # On the reversed parent this becomes an AfterPosition.
        mirrored = length - int(self)
        return AfterPosition(mirrored)
1729
class AfterPosition(int, AbstractPosition):
    """Specify a position where the actual location is found after it.

    Arguments:
    o position - The lower boundary of where the location can occur.
    o extension - An optional argument which must be zero since we don't
    have an extension. The argument is provided so that the same number of
    arguments can be passed to all position types.

    This is used to specify positions like (>10..100) where the location
    occurs somewhere after position 10.

    >>> p = AfterPosition(7)
    >>> p
    AfterPosition(7)
    >>> print(p)
    >7
    >>> int(p)
    7
    >>> p + 10
    17

    >>> isinstance(p, AfterPosition)
    True
    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    Note this potentially surprising behaviour:

    >>> p == ExactPosition(7)
    True
    >>> p == BeforePosition(7)
    True

    Just remember that for equality and sorting the position objects act
    like integers.
    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, extension=0):
        """Build the integer value, rejecting any non-zero extension."""
        if extension != 0:
            message = "Non-zero extension %s for exact position." % extension
            raise AttributeError(message)
        return int.__new__(cls, position)

    @property
    def position(self):
        """Legacy attribute to get position as integer (OBSOLETE)."""
        return int(self)

    @property
    def extension(self):
        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
        return 0

    def __repr__(self):
        """A string representation of the location for debugging."""
        class_name = self.__class__.__name__
        return "%s(%i)" % (class_name, int(self))

    def __str__(self):
        """Return the position with a ``>`` prefix."""
        return ">%s" % self.position

    def _shift(self, offset):
        # Go via self.__class__ so that any subclass is preserved
        same_class = self.__class__
        return same_class(int(self) + offset)

    def _flip(self, length):
        # On the reversed parent this becomes a BeforePosition.
        mirrored = length - int(self)
        return BeforePosition(mirrored)
1799
class OneOfPosition(int, AbstractPosition):
    """Specify a position where the location can be multiple positions.

    This models the GenBank 'one-of(1888,1901)' function, and tries
    to make this fit within the Biopython Position models. If this was
    a start position it should act like 1888, but as an end position 1901.

    >>> p = OneOfPosition(1888, [ExactPosition(1888), ExactPosition(1901)])
    >>> p
    OneOfPosition(1888, choices=[ExactPosition(1888), ExactPosition(1901)])
    >>> int(p)
    1888

    Integer comparisons and operators act like using int(p),

    >>> p == 1888
    True
    >>> p <= 1888
    True
    >>> p > 1888
    False
    >>> p + 100
    1988

    >>> isinstance(p, OneOfPosition)
    True
    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    The old legacy properties of position and extension give the
    starting/lowest/left-most position as an integer, and the
    distance to the ending/highest/right-most position as an integer.
    Note that the position object will act like one of the list of
    possible locations depending on how it was created:

    >>> p2 = OneOfPosition(1901, [ExactPosition(1888), ExactPosition(1901)])
    >>> p.position == p2.position == 1888
    True
    >>> p.extension == p2.extension == 13
    True
    >>> int(p) == int(p2)
    False
    >>> p == 1888
    True
    >>> p2 == 1901
    True

    """

    def __new__(cls, position, choices):
        """Initialize with a set of possible positions.

        choices is a list of AbstractPosition derived objects,
        specifying possible locations.

        position is an integer specifying the default behaviour.

        Raises AssertionError if position is not one of the choices.
        """
        if position not in choices:
            # Raised explicitly rather than with an assert statement, so the
            # validation is not stripped when Python runs with -O.
            raise AssertionError(
                "OneOfPosition: %r should match one of %r" % (position, choices))
        obj = int.__new__(cls, position)
        obj.position_choices = choices
        return obj

    @property
    def position(self):
        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
        return min(int(pos) for pos in self.position_choices)

    @property
    def extension(self):
        """Legacy attribute to get extension as integer (OBSOLETE)."""
        positions = [int(pos) for pos in self.position_choices]
        return max(positions) - min(positions)

    def __repr__(self):
        """String representation of the OneOfPosition location for debugging."""
        return "%s(%i, choices=%r)" % (self.__class__.__name__,
                                       int(self), self.position_choices)

    def __str__(self):
        """Return the choices in the form ``one-of(a,b,...)``."""
        return "one-of(%s)" % ",".join(str(pos) for pos in self.position_choices)

    def _shift(self, offset):
        """Return a copy with the value and every choice shifted (PRIVATE)."""
        return self.__class__(int(self) + offset,
                              [p._shift(offset) for p in self.position_choices])

    def _flip(self, length):
        """Return a copy for the reversed parent of given length (PRIVATE)."""
        # The choices are flipped one by one and listed in reverse order.
        return self.__class__(length - int(self),
                              [p._flip(length) for p in self.position_choices[::-1]])
1896
class PositionGap(object):
    """Simple class to hold information about a gap between positions."""

    def __init__(self, gap_size):
        """Store ``gap_size``, the object describing the gap."""
        self.gap_size = gap_size

    def __repr__(self):
        """A string representation of the position gap for debugging."""
        class_name = self.__class__.__name__
        size_repr = repr(self.gap_size)
        return "%s(%s)" % (class_name, size_repr)

    def __str__(self):
        """Return the gap in the form ``gap(size)``."""
        return "gap(%s)" % self.gap_size
if __name__ == "__main__":
    # Only when this file is executed as a script (not when imported):
    # call Biopython's run_doctest helper.
    # NOTE(review): presumably this runs the doctest examples embedded in
    # this module's docstrings -- confirm against Bio._utils.
    from Bio._utils import run_doctest
    run_doctest()