Package Bio :: Module SeqFeature
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqFeature

   1  # Copyright 2000-2003 Jeff Chang. 
   2  # Copyright 2001-2008 Brad Chapman. 
   3  # Copyright 2005-2016 by Peter Cock. 
   4  # Copyright 2006-2009 Michiel de Hoon. 
   5  # All rights reserved. 
   6  # 
   7  # This file is part of the Biopython distribution and governed by your 
   8  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
   9  # Please see the LICENSE file that should have been included as part of this 
  10  # package. 
  11  """Represent a Sequence Feature holding info about a part of a sequence. 
  12   
  13  This is heavily modeled after the Biocorba SeqFeature objects, and 
  14  may be pretty biased towards GenBank stuff since I'm writing it 
  15  for the GenBank parser output... 
  16   
  17  What's here: 
  18   
  19  Base class to hold a Feature 
  20  ---------------------------- 
  21   
  22  Classes: 
  23   - SeqFeature 
  24   
  25  Hold information about a Reference 
  26  ---------------------------------- 
  27   
  28  This is an attempt to create a General class to hold Reference type 
  29  information. 
  30   
  31  Classes: 
  32   - Reference 
  33   
  34  Specify locations of a feature on a Sequence 
  35  -------------------------------------------- 
  36   
  37  This aims to handle, in Ewan Birney's words, 'the dreaded fuzziness issue'. 
  38  This has the advantages of allowing us to handle fuzzy stuff in case anyone 
  39  needs it, and also be compatible with BioPerl etc and BioSQL. 
  40   
  41  Classes: 
  42   - FeatureLocation - Specify the start and end location of a feature. 
  43   - CompoundLocation - Collection of FeatureLocation objects (for joins etc). 
  44   - ExactPosition - Specify the position as being exact. 
  45   - WithinPosition - Specify a position occurring within some range. 
  46   - BetweenPosition - Specify a position occurring between a range (OBSOLETE?). 
  47   - BeforePosition - Specify the position as being found before some base. 
  48   - AfterPosition - Specify the position as being found after some base. 
  49   - OneOfPosition - Specify a position where the location can be multiple positions. 
  50   - UnknownPosition - Represents missing information like '?' in UniProt. 
  51   
  52  """ 
  53   
  54  from __future__ import print_function 
  55   
  56  from collections import OrderedDict 
  57   
  58  from Bio._py3k import _is_int_or_long 
  59   
  60  from Bio.Seq import MutableSeq, reverse_complement 
class SeqFeature(object):
    """Represent a Sequence Feature on an object.

    Attributes:
     - location - the location of the feature on the sequence (FeatureLocation)
     - type - the specified type of the feature (ie. CDS, exon, repeat...)
     - location_operator - a string specifying how this SeqFeature may
       be related to others. For example, in the example GenBank feature
       shown below, the location_operator would be "join". This is a proxy
       for feature.location.operator and only applies to compound locations.
     - strand - A value specifying on which strand (of a DNA sequence, for
       instance) the feature deals with. 1 indicates the plus strand, -1
       indicates the minus strand, 0 indicates stranded but unknown (? in GFF3),
       while the default of None indicates that strand doesn't apply (dot in GFF3,
       e.g. features on proteins). Note this is a shortcut for accessing the
       strand property of the feature's location.
     - id - A string identifier for the feature.
     - ref - A reference to another sequence. This could be an accession
       number for some different sequence. Note this is a shortcut for the
       reference property of the feature's location.
     - ref_db - A different database for the reference accession number.
       Note this is a shortcut for the reference property of the location
     - qualifiers - A dictionary of qualifiers on the feature. These are
       analogous to the qualifiers from a GenBank feature table. The keys of
       the dictionary are qualifier names, the values are the qualifier
       values. As of Biopython 1.69 this is an ordered dictionary.

    """

    def __init__(self, location=None, type='', location_operator='',
                 strand=None, id="<unknown id>",
                 qualifiers=None, sub_features=None,
                 ref=None, ref_db=None):
        """Initialize a SeqFeature on a Sequence.

        location can either be a FeatureLocation (with strand argument also
        given if required), or None.

        e.g. With no strand, on the forward strand, and on the reverse strand:

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> f1 = SeqFeature(FeatureLocation(5, 10), type="domain")
        >>> f1.strand == f1.location.strand == None
        True
        >>> f2 = SeqFeature(FeatureLocation(7, 110, strand=1), type="CDS")
        >>> f2.strand == f2.location.strand == +1
        True
        >>> f3 = SeqFeature(FeatureLocation(9, 108, strand=-1), type="CDS")
        >>> f3.strand == f3.location.strand == -1
        True

        An invalid strand will trigger an exception:

        >>> f4 = SeqFeature(FeatureLocation(50, 60), strand=2)
        Traceback (most recent call last):
           ...
        ValueError: Strand should be +1, -1, 0 or None, not 2

        Similarly if set via the FeatureLocation directly:

        >>> loc4 = FeatureLocation(50, 60, strand=2)
        Traceback (most recent call last):
           ...
        ValueError: Strand should be +1, -1, 0 or None, not 2

        For exact start/end positions, an integer can be used (as shown above)
        as shorthand for the ExactPosition object. For non-exact locations, the
        FeatureLocation must be specified via the appropriate position objects.

        Note that the strand, ref and ref_db arguments to the SeqFeature are
        now obsolete and will be deprecated in a future release (which will
        give warning messages) and later removed. Set them via the location
        object instead.

        Note that location_operator and sub_features arguments can no longer
        be used, instead do this via the CompoundLocation object.

        Raises TypeError if location is neither None, a FeatureLocation nor a
        CompoundLocation, or if sub_features is given. Raises ValueError if
        strand, ref, ref_db or location_operator are given without a suitable
        location to store them on.
        """
        if location is not None and not isinstance(location, FeatureLocation) \
                and not isinstance(location, CompoundLocation):
            raise TypeError(
                "FeatureLocation, CompoundLocation (or None) required for the location")
        self.location = location
        self.type = type
        # The assignments below go through properties which proxy to the
        # location object, hence they must come after self.location is set.
        if location_operator:
            # TODO - Deprecation warning
            self.location_operator = location_operator
        if strand is not None:
            # TODO - Deprecation warning
            self.strand = strand
        self.id = id
        if qualifiers is None:
            qualifiers = OrderedDict()
        self.qualifiers = qualifiers
        if sub_features is not None:
            raise TypeError("Rather than sub_features, use a CompoundLocation")
        if ref is not None:
            # TODO - Deprecation warning
            self.ref = ref
        if ref_db is not None:
            # TODO - Deprecation warning
            self.ref_db = ref_db

    def _get_strand(self):
        """Get function for the strand property (PRIVATE)."""
        return self.location.strand

    def _set_strand(self, value):
        """Set function for the strand property (PRIVATE)."""
        try:
            self.location.strand = value
        except AttributeError:
            # Assigning None to a feature without a location is a no-op;
            # anything else cannot be stored anywhere.
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set strand without a location.")
            else:
                raise

    strand = property(fget=_get_strand, fset=_set_strand,
                      doc="""Feature's strand

                      This is a shortcut for feature.location.strand
                      """)

    def _get_ref(self):
        """Get function for the reference property (PRIVATE)."""
        try:
            return self.location.ref
        except AttributeError:
            return None

    def _set_ref(self, value):
        """Set function for the reference property (PRIVATE)."""
        try:
            self.location.ref = value
        except AttributeError:
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set ref without a location.")
            else:
                raise

    ref = property(fget=_get_ref, fset=_set_ref,
                   doc="""Feature location reference (e.g. accession).

                   This is a shortcut for feature.location.ref
                   """)

    def _get_ref_db(self):
        """Get function for the database reference property (PRIVATE)."""
        try:
            return self.location.ref_db
        except AttributeError:
            return None

    def _set_ref_db(self, value):
        """Set function for the database reference property (PRIVATE).

        Mirrors the strand and ref setters: assigning None when the feature
        has no location is silently accepted, while any other value raises
        ValueError because there is no location object to hold it.
        """
        try:
            self.location.ref_db = value
        except AttributeError:
            if self.location is None:
                if value is not None:
                    raise ValueError("Can't set ref_db without a location.")
            else:
                raise

    ref_db = property(fget=_get_ref_db, fset=_set_ref_db,
                      doc="""Feature location reference's database.

                      This is a shortcut for feature.location.ref_db
                      """)

    def _get_location_operator(self):
        """Get function for the location operator property (PRIVATE)."""
        try:
            return self.location.operator
        except AttributeError:
            # No location, or a location type without an operator attribute.
            return None

    def _set_location_operator(self, value):
        """Set function for the location operator property (PRIVATE).

        Falsy values (None, empty string) are ignored. A true value is only
        accepted when the location is a CompoundLocation; otherwise a
        ValueError is raised.
        """
        if value:
            if isinstance(self.location, CompoundLocation):
                self.location.operator = value
            elif self.location is None:
                raise ValueError(
                    "Location is None so can't set its operator (to %r)" % value)
            else:
                raise ValueError(
                    "Only CompoundLocation gets an operator (%r)" % value)

    location_operator = property(fget=_get_location_operator, fset=_set_location_operator,
                                 doc="Location operator for compound locations (e.g. join).")

    def __repr__(self):
        """Represent the feature as a string for debugging.

        Only the location plus any non-empty type, location_operator, id
        (unless it is the "<unknown id>" placeholder), ref and ref_db are
        shown; the qualifiers are not included.
        """
        answer = "%s(%s" % (self.__class__.__name__, repr(self.location))
        if self.type:
            answer += ", type=%s" % repr(self.type)
        if self.location_operator:
            answer += ", location_operator=%s" % repr(self.location_operator)
        if self.id and self.id != "<unknown id>":
            answer += ", id=%s" % repr(self.id)
        if self.ref:
            answer += ", ref=%s" % repr(self.ref)
        if self.ref_db:
            answer += ", ref_db=%s" % repr(self.ref_db)
        answer += ")"
        return answer

    def __str__(self):
        """Return the full feature as a python string.

        The qualifiers are listed sorted by key, one per line.
        """
        out = "type: %s\n" % self.type
        out += "location: %s\n" % self.location
        if self.id and self.id != "<unknown id>":
            out += "id: %s\n" % self.id
        out += "qualifiers:\n"
        for qual_key in sorted(self.qualifiers):
            # NOTE(review): the listing this was recovered from collapses
            # runs of whitespace, so the leading indent of this format
            # string may originally have been wider - confirm upstream.
            out += " Key: %s, Value: %s\n" % (qual_key,
                                              self.qualifiers[qual_key])
        return out

    def _shift(self, offset):
        """Return a copy of the feature with its location shifted (PRIVATE).

        The annotation qualifiers are copied.
        """
        return SeqFeature(location=self.location._shift(offset),
                          type=self.type,
                          location_operator=self.location_operator,
                          id=self.id,
                          qualifiers=OrderedDict(self.qualifiers.items()))

    def _flip(self, length):
        """Return a copy of the feature with its location flipped (PRIVATE).

        The argument length gives the length of the parent sequence. For
        example a location 0..20 (+1 strand) with parent length 30 becomes
        after flipping 10..30 (-1 strand). Strandless (None) or unknown
        strand (0) remain like that - just their end points are changed.

        The annotation qualifiers are copied.
        """
        return SeqFeature(location=self.location._flip(length),
                          type=self.type,
                          location_operator=self.location_operator,
                          id=self.id,
                          qualifiers=OrderedDict(self.qualifiers.items()))

    def extract(self, parent_sequence):
        """Extract the feature's sequence from supplied parent sequence.

        The parent_sequence can be a Seq like object or a string, and will
        generally return an object of the same type. The exception to this is
        a MutableSeq as the parent sequence will return a Seq object.

        This should cope with complex locations including complements, joins
        and fuzzy positions. Even mixed strand features should work! This
        also covers features on protein sequences (e.g. domains), although
        here reverse strand features are not permitted.

        >>> from Bio.Seq import Seq
        >>> from Bio.Alphabet import generic_protein
        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
        >>> f = SeqFeature(FeatureLocation(8, 15), type="domain")
        >>> f.extract(seq)
        Seq('VALIVIC', ProteinAlphabet())

        If the FeatureLocation is None, e.g. when parsing invalid locus
        locations in the GenBank parser, extract() will raise a ValueError.

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import SeqFeature
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
        >>> f = SeqFeature(None, type="domain")
        >>> f.extract(seq)
        Traceback (most recent call last):
           ...
        ValueError: The feature's .location is None. Check the sequence file for a valid location.

        Note - currently only compound features of type "join" are supported.
        """
        if self.location is None:
            raise ValueError("The feature's .location is None. Check the "
                             "sequence file for a valid location.")
        return self.location.extract(parent_sequence)

    def translate(self, parent_sequence, table="Standard", start_offset=None,
                  stop_symbol="*", to_stop=False, cds=False, gap=None):
        """Get a translation of the feature's sequence.

        This method is intended for CDS or other features that code proteins
        and is a shortcut that will both extract the feature and
        translate it, taking into account the codon_start and transl_table
        qualifiers, if they are present. If they are not present the
        value of the arguments "table" and "start_offset" are used.

        The arguments stop_symbol, to_stop, cds and gap have the same meaning
        as Seq.translate, refer to that documentation for further information.

        Arguments:
         - parent_sequence - This method will translate DNA or RNA sequences,
           and those with a nucleotide or generic alphabet. Trying to
           translate a protein sequence raises an exception.
         - table - Which codon table to use if there is no transl_table
           qualifier for this feature. This can be either a name
           (string), an NCBI identifier (integer), or a CodonTable
           object (useful for non-standard genetic codes). This
           defaults to the "Standard" table.
         - start_offset - offset at which the first complete codon of a
           coding feature can be found, relative to the first base of
           that feature. Has a valid value of 0, 1 or 2. NOTE: this
           uses python's 0-based numbering whereas the codon_start
           qualifier in files from NCBI use 1-based numbering.
           Will override a codon_start qualifier

        >>> from Bio.Seq import Seq
        >>> from Bio.Alphabet import generic_dna
        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> seq = Seq("GGTTACACTTACCGATAATGTCTCTGATGA", generic_dna)
        >>> f = SeqFeature(FeatureLocation(0, 30), type="CDS")
        >>> f.qualifiers['transl_table'] = [11]
        >>> f.translate(seq)
        Seq('GYTYR*CL**', HasStopCodon(ExtendedIUPACProtein(), '*'))

        Now use the start_offset argument to change the frame. Note
        this uses python 0-based numbering

        >>> f.translate(seq, start_offset=1)
        Seq('VTLTDNVSD', ExtendedIUPACProtein())

        Alternatively use the codon_start qualifier to do the same
        thing. Note: this uses 1-based numbering, which is found
        in files from NCBI

        >>> f.qualifiers['codon_start'] = [2]
        >>> f.translate(seq)
        Seq('VTLTDNVSD', ExtendedIUPACProtein())
        """
        # see if this feature should be translated in a different
        # frame using the "codon_start" qualifier
        if start_offset is None:
            try:
                # codon_start is 1-based in the file, start_offset is 0-based
                start_offset = int(self.qualifiers["codon_start"][0]) - 1
            except KeyError:
                start_offset = 0

        if start_offset not in [0, 1, 2]:
            raise ValueError("The start_offset must be 0, 1, or 2. "
                             "The supplied value is {}. Check the value "
                             "of either the codon_start qualifier or "
                             "the start_offset argument".format(start_offset))

        feat_seq = self.extract(parent_sequence)[start_offset:]
        # A transl_table qualifier on the feature wins over the table argument
        codon_table = self.qualifiers.get("transl_table", [table])[0]

        return feat_seq.translate(table=codon_table, stop_symbol=stop_symbol,
                                  to_stop=to_stop, cds=cds, gap=gap)

    # Python 3:
    def __bool__(self):
        """Boolean value of an instance of this class (True).

        This behaviour is for backwards compatibility, since until the
        __len__ method was added, a SeqFeature always evaluated as True.

        Note that in comparison, Seq objects, strings, lists, etc, will all
        evaluate to False if they have length zero.

        WARNING: The SeqFeature may in future evaluate to False when its
        length is zero (in order to better match normal python behaviour)!
        """
        return True

    # Python 2:
    __nonzero__ = __bool__

    def __len__(self):
        """Return the length of the region where the feature is located.

        >>> from Bio.Seq import Seq
        >>> from Bio.Alphabet import generic_protein
        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
        >>> f = SeqFeature(FeatureLocation(8, 15), type="domain")
        >>> len(f)
        7
        >>> f.extract(seq)
        Seq('VALIVIC', ProteinAlphabet())
        >>> len(f.extract(seq))
        7

        This is a proxy for taking the length of the feature's location:

        >>> len(f.location)
        7

        For simple features this is the same as the region spanned (end
        position minus start position using Pythonic counting). However, for
        a compound location (e.g. a CDS as the join of several exons) the
        gaps are not counted (e.g. introns). This ensures that len(f) matches
        len(f.extract(parent_seq)), and also makes sure things work properly
        with features wrapping the origin etc.
        """
        return len(self.location)

    def __iter__(self):
        """Iterate over the parent positions within the feature.

        The iteration order is strand aware, and can be thought of as moving
        along the feature using the parent sequence coordinates:

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> f = SeqFeature(FeatureLocation(5, 10), type="domain", strand=-1)
        >>> len(f)
        5
        >>> for i in f: print(i)
        9
        8
        7
        6
        5
        >>> list(f)
        [9, 8, 7, 6, 5]

        This is a proxy for iterating over the location,

        >>> list(f.location)
        [9, 8, 7, 6, 5]
        """
        return iter(self.location)

    def __contains__(self, value):
        """Check if an integer position is within the feature.

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> f = SeqFeature(FeatureLocation(5, 10), type="domain", strand=-1)
        >>> len(f)
        5
        >>> [i for i in range(15) if i in f]
        [5, 6, 7, 8, 9]

        For example, to see which features include a SNP position, you could
        use this:

        >>> from Bio import SeqIO
        >>> record = SeqIO.read("GenBank/NC_000932.gb", "gb")
        >>> for f in record.features:
        ...     if 1750 in f:
        ...         print("%s %s" % (f.type, f.location))
        source [0:154478](+)
        gene [1716:4347](-)
        tRNA join{[4310:4347](-), [1716:1751](-)}

        Note that for a feature defined as a join of several subfeatures (e.g.
        the union of several exons) the gaps are not checked (e.g. introns).
        In this example, the tRNA location is defined in the GenBank file as
        complement(join(1717..1751,4311..4347)), so that position 1760 falls
        in the gap:

        >>> for f in record.features:
        ...     if 1760 in f:
        ...         print("%s %s" % (f.type, f.location))
        source [0:154478](+)
        gene [1716:4347](-)

        Note that additional care may be required with fuzzy locations, for
        example just before a BeforePosition:

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> from Bio.SeqFeature import BeforePosition
        >>> f = SeqFeature(FeatureLocation(BeforePosition(3), 8), type="domain")
        >>> len(f)
        5
        >>> [i for i in range(10) if i in f]
        [3, 4, 5, 6, 7]

        Note that this is a proxy for testing membership on the location.

        >>> [i for i in range(10) if i in f.location]
        [3, 4, 5, 6, 7]
        """
        return value in self.location
536
# --- References


# TODO -- Will this hold PubMed and Medline information decently?
class Reference(object):
    """Represent a Generic Reference object.

    Attributes:
     - location - A list of Location objects specifying regions of
       the sequence that the references correspond to. If no locations are
       specified, the entire sequence is assumed.
     - authors - A big old string, or a list split by author, of authors
       for the reference.
     - consrtm - A string, empty by default (presumably the consortium
       name, as in the GenBank CONSRTM line - TODO confirm).
     - title - The title of the reference.
     - journal - Journal the reference was published in.
     - medline_id - A medline reference for the article.
     - pubmed_id - A pubmed reference for the article.
     - comment - A place to stick any comments about the reference.

    """

    def __init__(self):
        """Initialize the class with no locations and empty string fields."""
        self.location = []
        self.authors = ''
        self.consrtm = ''
        self.title = ''
        self.journal = ''
        self.medline_id = ''
        self.pubmed_id = ''
        self.comment = ''

    def __str__(self):
        """Return the full Reference object as a python string.

        One "name: value" line per field; each location gets its own line
        and the consrtm line is only present when that field is non-empty.
        """
        lines = ["location: %s\n" % single_location
                 for single_location in self.location]
        lines.append("authors: %s\n" % self.authors)
        if self.consrtm:
            lines.append("consrtm: %s\n" % self.consrtm)
        lines.append("title: %s\n" % self.title)
        lines.append("journal: %s\n" % self.journal)
        lines.append("medline id: %s\n" % self.medline_id)
        lines.append("pubmed id: %s\n" % self.pubmed_id)
        lines.append("comment: %s\n" % self.comment)
        return "".join(lines)

    def __repr__(self):
        """Represent the Reference object as a string for debugging."""
        # TODO - Update this if __init__ later accepts values
        return "%s(title=%s, ...)" % (self.__class__.__name__,
                                      repr(self.title))

    def __eq__(self, other):
        """Check if two Reference objects should be considered equal.

        Note prior to Biopython 1.70 the location was not compared, as
        until then __eq__ for the FeatureLocation class was not defined.

        Returns NotImplemented when other is not a Reference, so that
        comparisons such as ``ref == None`` evaluate to False rather than
        failing with an AttributeError.
        """
        if not isinstance(other, Reference):
            return NotImplemented
        return self.authors == other.authors and \
            self.consrtm == other.consrtm and \
            self.title == other.title and \
            self.journal == other.journal and \
            self.medline_id == other.medline_id and \
            self.pubmed_id == other.pubmed_id and \
            self.comment == other.comment and \
            self.location == other.location

    def __ne__(self, other):
        """Implement the not-equal operand."""
        # This is needed for py2, but not for py3.
        return not self == other
610
611 612 # --- Handling feature locations 613 614 -class FeatureLocation(object):
615 """Specify the location of a feature along a sequence. 616 617 The FeatureLocation is used for simple continuous features, which can 618 be described as running from a start position to and end position 619 (optionally with a strand and reference information). More complex 620 locations made up from several non-continuous parts (e.g. a coding 621 sequence made up of several exons) are described using a SeqFeature 622 with a CompoundLocation. 623 624 Note that the start and end location numbering follow Python's scheme, 625 thus a GenBank entry of 123..150 (one based counting) becomes a location 626 of [122:150] (zero based counting). 627 628 >>> from Bio.SeqFeature import FeatureLocation 629 >>> f = FeatureLocation(122, 150) 630 >>> print(f) 631 [122:150] 632 >>> print(f.start) 633 122 634 >>> print(f.end) 635 150 636 >>> print(f.strand) 637 None 638 639 Note the strand defaults to None. If you are working with nucleotide 640 sequences you'd want to be explicit if it is the forward strand: 641 642 >>> from Bio.SeqFeature import FeatureLocation 643 >>> f = FeatureLocation(122, 150, strand=+1) 644 >>> print(f) 645 [122:150](+) 646 >>> print(f.strand) 647 1 648 649 Note that for a parent sequence of length n, the FeatureLocation 650 start and end must satisfy the inequality 0 <= start <= end <= n. 651 This means even for features on the reverse strand of a nucleotide 652 sequence, we expect the 'start' coordinate to be less than the 653 'end'. 654 655 >>> from Bio.SeqFeature import FeatureLocation 656 >>> r = FeatureLocation(122, 150, strand=-1) 657 >>> print(r) 658 [122:150](-) 659 >>> print(r.start) 660 122 661 >>> print(r.end) 662 150 663 >>> print(r.strand) 664 -1 665 666 i.e. Rather than thinking of the 'start' and 'end' biologically in a 667 strand aware manor, think of them as the 'left most' or 'minimum' 668 boundary, and the 'right most' or 'maximum' boundary of the region 669 being described. 
This is particularly important with compound 670 locations describing non-continuous regions. 671 672 In the example above we have used standard exact positions, but there 673 are also specialised position objects used to represent fuzzy positions 674 as well, for example a GenBank location like complement(<123..150) 675 would use a BeforePosition object for the start. 676 """ 677
678 - def __init__(self, start, end, strand=None, ref=None, ref_db=None):
679 """Initialize the class. 680 681 start and end arguments specify the values where the feature begins 682 and ends. These can either by any of the ``*Position`` objects that 683 inherit from AbstractPosition, or can just be integers specifying the 684 position. In the case of integers, the values are assumed to be 685 exact and are converted in ExactPosition arguments. This is meant 686 to make it easy to deal with non-fuzzy ends. 687 688 i.e. Short form: 689 690 >>> from Bio.SeqFeature import FeatureLocation 691 >>> loc = FeatureLocation(5, 10, strand=-1) 692 >>> print(loc) 693 [5:10](-) 694 695 Explicit form: 696 697 >>> from Bio.SeqFeature import FeatureLocation, ExactPosition 698 >>> loc = FeatureLocation(ExactPosition(5), ExactPosition(10), strand=-1) 699 >>> print(loc) 700 [5:10](-) 701 702 Other fuzzy positions are used similarly, 703 704 >>> from Bio.SeqFeature import FeatureLocation 705 >>> from Bio.SeqFeature import BeforePosition, AfterPosition 706 >>> loc2 = FeatureLocation(BeforePosition(5), AfterPosition(10), strand=-1) 707 >>> print(loc2) 708 [<5:>10](-) 709 710 For nucleotide features you will also want to specify the strand, 711 use 1 for the forward (plus) strand, -1 for the reverse (negative) 712 strand, 0 for stranded but strand unknown (? in GFF3), or None for 713 when the strand does not apply (dot in GFF3), e.g. features on 714 proteins. 
715 716 >>> loc = FeatureLocation(5, 10, strand=+1) 717 >>> print(loc) 718 [5:10](+) 719 >>> print(loc.strand) 720 1 721 722 Normally feature locations are given relative to the parent 723 sequence you are working with, but an explicit accession can 724 be given with the optional ref and db_ref strings: 725 726 >>> loc = FeatureLocation(105172, 108462, ref="AL391218.9", strand=1) 727 >>> print(loc) 728 AL391218.9[105172:108462](+) 729 >>> print(loc.ref) 730 AL391218.9 731 732 """ 733 # TODO - Check 0 <= start <= end (<= length of reference) 734 if isinstance(start, AbstractPosition): 735 self._start = start 736 elif _is_int_or_long(start): 737 self._start = ExactPosition(start) 738 else: 739 raise TypeError("start=%r %s" % (start, type(start))) 740 if isinstance(end, AbstractPosition): 741 self._end = end 742 elif _is_int_or_long(end): 743 self._end = ExactPosition(end) 744 else: 745 raise TypeError("end=%r %s" % (end, type(end))) 746 if isinstance(self.start.position, int) and \ 747 isinstance(self.end.position, int) and self.start > self.end: 748 raise ValueError('End location ({}) must be greater than or equal ' 749 'to start location ({})'.format(self.end, 750 self.start)) 751 self.strand = strand 752 self.ref = ref 753 self.ref_db = ref_db
754
755 - def _get_strand(self):
756 """Get function for the strand property (PRIVATE).""" 757 return self._strand
758
759 - def _set_strand(self, value):
760 """Set function for the strand property (PRIVATE).""" 761 if value not in [+1, -1, 0, None]: 762 raise ValueError("Strand should be +1, -1, 0 or None, not %r" 763 % value) 764 self._strand = value

    # Public ``strand`` attribute: reads go through _get_strand and writes
    # through _set_strand, so every assignment is validated.
    strand = property(fget=_get_strand, fset=_set_strand,
                      doc="Strand of the location (+1, -1, 0 or None).")

769 - def __str__(self):
770 """Return a representation of the FeatureLocation object (with python counting). 771 772 For the simple case this uses the python splicing syntax, [122:150] 773 (zero based counting) which GenBank would call 123..150 (one based 774 counting). 775 """ 776 answer = "[%s:%s]" % (self._start, self._end) 777 if self.ref and self.ref_db: 778 answer = "%s:%s%s" % (self.ref_db, self.ref, answer) 779 elif self.ref: 780 answer = self.ref + answer 781 # Is ref_db without ref meaningful? 782 if self.strand is None: 783 return answer 784 elif self.strand == +1: 785 return answer + "(+)" 786 elif self.strand == -1: 787 return answer + "(-)" 788 else: 789 # strand = 0, stranded but strand unknown, ? in GFF3 790 return answer + "(?)"
791
792 - def __repr__(self):
793 """Represent the FeatureLocation object as a string for debugging.""" 794 optional = "" 795 if self.strand is not None: 796 optional += ", strand=%r" % self.strand 797 if self.ref is not None: 798 optional += ", ref=%r" % self.ref 799 if self.ref_db is not None: 800 optional += ", ref_db=%r" % self.ref_db 801 return "%s(%r, %r%s)" \ 802 % (self.__class__.__name__, self.start, self.end, optional)
803
804 - def __add__(self, other):
805 """Combine location with another FeatureLocation object, or shift it. 806 807 You can add two feature locations to make a join CompoundLocation: 808 809 >>> from Bio.SeqFeature import FeatureLocation 810 >>> f1 = FeatureLocation(5, 10) 811 >>> f2 = FeatureLocation(20, 30) 812 >>> combined = f1 + f2 813 >>> print(combined) 814 join{[5:10], [20:30]} 815 816 This is thus equivalent to: 817 818 >>> from Bio.SeqFeature import CompoundLocation 819 >>> join = CompoundLocation([f1, f2]) 820 >>> print(join) 821 join{[5:10], [20:30]} 822 823 You can also use sum(...) in this way: 824 825 >>> join = sum([f1, f2]) 826 >>> print(join) 827 join{[5:10], [20:30]} 828 829 Furthermore, you can combine a FeatureLocation with a CompoundLocation 830 in this way. 831 832 Separately, adding an integer will give a new FeatureLocation with 833 its start and end offset by that amount. For example: 834 835 >>> print(f1) 836 [5:10] 837 >>> print(f1 + 100) 838 [105:110] 839 >>> print(200 + f1) 840 [205:210] 841 842 This can be useful when editing annotation. 843 """ 844 if isinstance(other, FeatureLocation): 845 return CompoundLocation([self, other]) 846 elif _is_int_or_long(other): 847 return self._shift(other) 848 else: 849 # This will allow CompoundLocation's __radd__ to be called: 850 return NotImplemented
851
852 - def __radd__(self, other):
853 """Add a feature locationanother FeatureLocation object to the left.""" 854 if _is_int_or_long(other): 855 return self._shift(other) 856 else: 857 return NotImplemented
858
859 - def __nonzero__(self):
860 """Return True regardless of the length of the feature. 861 862 This behaviour is for backwards compatibility, since until the 863 __len__ method was added, a FeatureLocation always evaluated as True. 864 865 Note that in comparison, Seq objects, strings, lists, etc, will all 866 evaluate to False if they have length zero. 867 868 WARNING: The FeatureLocation may in future evaluate to False when its 869 length is zero (in order to better match normal python behaviour)! 870 """ 871 return True
872
873 - def __len__(self):
874 """Return the length of the region described by the FeatureLocation object. 875 876 Note that extra care may be needed for fuzzy locations, e.g. 877 878 >>> from Bio.SeqFeature import FeatureLocation 879 >>> from Bio.SeqFeature import BeforePosition, AfterPosition 880 >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10)) 881 >>> len(loc) 882 5 883 """ 884 return int(self._end) - int(self._start)
885
def __contains__(self, value):
    """Check if an integer position is within the FeatureLocation object.

    The start is inclusive and the end is exclusive. A ValueError is
    raised if the value is not an integer.

    Note that extra care may be needed for fuzzy locations, e.g.

    >>> from Bio.SeqFeature import FeatureLocation
    >>> from Bio.SeqFeature import BeforePosition, AfterPosition
    >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10))
    >>> len(loc)
    5
    >>> [i for i in range(15) if i in loc]
    [5, 6, 7, 8, 9]
    """
    if not _is_int_or_long(value):
        raise ValueError("Currently we only support checking for integer "
                         "positions being within a FeatureLocation.")
    outside = value < self._start or value >= self._end
    return not outside
906
907 - def __iter__(self):
908 """Iterate over the parent positions within the FeatureLocation object. 909 910 >>> from Bio.SeqFeature import FeatureLocation 911 >>> from Bio.SeqFeature import BeforePosition, AfterPosition 912 >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10)) 913 >>> len(loc) 914 5 915 >>> for i in loc: print(i) 916 5 917 6 918 7 919 8 920 9 921 >>> list(loc) 922 [5, 6, 7, 8, 9] 923 >>> [i for i in range(15) if i in loc] 924 [5, 6, 7, 8, 9] 925 926 Note this is strand aware: 927 928 >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10), strand = -1) 929 >>> list(loc) 930 [9, 8, 7, 6, 5] 931 """ 932 if self.strand == -1: 933 for i in range(self._end - 1, self._start - 1, -1): 934 yield i 935 else: 936 for i in range(self._start, self._end): 937 yield i
938
def __eq__(self, other):
    """Implement equality by comparing all the location attributes.

    Anything which is not a FeatureLocation compares as unequal. Otherwise
    the start, end, strand, ref and ref_db must all match.
    """
    if not isinstance(other, FeatureLocation):
        return False
    return (self._start == other.start
            and self._end == other.end
            and self._strand == other.strand
            and self.ref == other.ref
            and self.ref_db == other.ref_db)
948
949 - def __ne__(self, other):
950 """Implement the not-equal operand.""" 951 # This is needed for py2, but not for py3. 952 return not self == other
953
def _shift(self, offset):
    """Return a copy of the FeatureLocation shifted by an offset (PRIVATE).

    The start and end positions are each shifted via their own _shift
    method, and the strand is carried over. A ValueError is raised if the
    location references another sequence (ref or ref_db is set).
    """
    # TODO - What if offset is a fuzzy position?
    if self.ref or self.ref_db:
        # TODO - Return self?
        raise ValueError("Feature references another sequence.")
    shifted_start = self._start._shift(offset)
    shifted_end = self._end._shift(offset)
    return FeatureLocation(start=shifted_start,
                           end=shifted_end,
                           strand=self.strand)
963
def _flip(self, length):
    """Return a copy of the location after the parent is reversed (PRIVATE).

    The length argument is passed on to the _flip method of the start and
    end positions. A strand of +1 or -1 is swapped, any other value is
    kept. A ValueError is raised if the location references another
    sequence (ref or ref_db is set).
    """
    if self.ref or self.ref_db:
        # TODO - Return self?
        raise ValueError("Feature references another sequence.")
    # Default covers a strand of 0 or None, which is left alone
    flipped_strand = self.strand
    if self.strand == +1:
        flipped_strand = -1
    elif self.strand == -1:
        flipped_strand = +1
    # Note this will flip the start and end too!
    new_start = self._end._flip(length)
    new_end = self._start._flip(length)
    return FeatureLocation(start=new_start,
                           end=new_end,
                           strand=flipped_strand)
@property
def parts(self):
    """Read only list of sections (always one, the FeatureLocation object).

    A new single-element list holding this object is returned each time.

    This is a convenience property allowing you to write code handling
    both simple FeatureLocation objects (with one part) and more complex
    CompoundLocation objects (with multiple parts) interchangeably.
    """
    only_part = self
    return [only_part]
@property
def start(self):
    """Start location - left most (minimum) value, regardless of strand.

    Read only, returns an integer like position object, possibly a fuzzy
    position.
    """
    position = self._start
    return position
@property
def end(self):
    """End location - right most (maximum) value, regardless of strand.

    Read only, returns an integer like position object, possibly a fuzzy
    position.
    """
    position = self._end
    return position
@property
def nofuzzy_start(self):
    """Start position (integer, approximated if fuzzy, read only) (OBSOLETE).

    This is now an alias for int(feature.start), which should be
    used in preference -- unless you are trying to support old
    versions of Biopython.

    If the integer conversion raises a TypeError and the start is an
    UnknownPosition then None is returned, otherwise the TypeError is
    re-raised.
    """
    position = self._start
    try:
        return int(position)
    except TypeError:
        if not isinstance(position, UnknownPosition):
            raise
        return None
@property
def nofuzzy_end(self):
    """End position (integer, approximated if fuzzy, read only) (OBSOLETE).

    This is now an alias for int(feature.end), which should be
    used in preference -- unless you are trying to support old
    versions of Biopython.

    If the integer conversion raises a TypeError and the end is an
    UnknownPosition then None is returned, otherwise the TypeError is
    re-raised.
    """
    position = self._end
    try:
        return int(position)
    except TypeError:
        if not isinstance(position, UnknownPosition):
            raise
        return None
1038
def extract(self, parent_sequence):
    """Extract the sequence from supplied parent sequence using the FeatureLocation object.

    The parent_sequence can be a Seq like object or a string, and will
    generally return an object of the same type. The exception to this is
    a MutableSeq as the parent sequence will return a Seq object.

    A ValueError is raised if the location references another sequence
    (ref or ref_db is set). For a strand of -1 the reverse complement of
    the slice is returned.

    >>> from Bio.Seq import Seq
    >>> from Bio.Alphabet import generic_protein
    >>> from Bio.SeqFeature import FeatureLocation
    >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
    >>> feature_loc = FeatureLocation(8, 15)
    >>> feature_loc.extract(seq)
    Seq('VALIVIC', ProteinAlphabet())

    """
    if self.ref or self.ref_db:
        # TODO - Take a dictionary as an optional argument?
        raise ValueError("Feature references another sequence.")
    if isinstance(parent_sequence, MutableSeq):
        # This avoids complications with reverse complements
        # (the MutableSeq reverse complement acts in situ)
        parent_sequence = parent_sequence.toseq()
    f_seq = parent_sequence[self.nofuzzy_start:self.nofuzzy_end]
    if self.strand != -1:
        return f_seq
    try:
        return f_seq.reverse_complement()
    except AttributeError:
        # No reverse_complement method, so this should be a plain string
        assert isinstance(f_seq, str)
        return reverse_complement(f_seq)
1070
class CompoundLocation(object):
    """For handling joins etc where a feature location has several parts."""

    def __init__(self, parts, operator="join"):
        """Initialize the class.

        Arguments:
         - parts - An iterable of at least two FeatureLocation objects,
           which is copied into a new list.
         - operator - String stored as the operator attribute and used
           when printing the location, default "join".

        A ValueError is raised if any part is not a FeatureLocation, or
        if fewer than two parts are given.

        >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation
        >>> f1 = FeatureLocation(10, 40, strand=+1)
        >>> f2 = FeatureLocation(50, 59, strand=+1)
        >>> f = CompoundLocation([f1, f2])
        >>> len(f) == len(f1) + len(f2) == 39 == len(list(f))
        True
        >>> print(f.operator)
        join
        >>> 5 in f
        False
        >>> 15 in f
        True
        >>> f.strand
        1

        Notice that the strand of the compound location is computed
        automatically - in the case of mixed strands on the sub-locations
        the overall strand is set to None.

        >>> f = CompoundLocation([FeatureLocation(3, 6, strand=+1),
        ...                       FeatureLocation(10, 13, strand=-1)])
        >>> print(f.strand)
        None
        >>> len(f)
        6
        >>> list(f)
        [3, 4, 5, 12, 11, 10]

        The example above doing list(f) iterates over the coordinates within the
        feature. This allows you to use max and min on the location, to find the
        range covered:

        >>> min(f)
        3
        >>> max(f)
        12

        More generally, you can use the compound location's start and end which
        give the full range covered, 0 <= start <= end <= full sequence length.

        >>> f.start == min(f)
        True
        >>> f.end == max(f) + 1
        True

        This is consistent with the behaviour of the simple FeatureLocation for
        a single region, where again the 'start' and 'end' do not necessarily
        give the biological start and end, but rather the 'minimal' and 'maximal'
        coordinate boundaries.

        Note that adding locations provides a more intuitive method of
        construction:

        >>> f = FeatureLocation(3, 6, strand=+1) + FeatureLocation(10, 13, strand=-1)
        >>> len(f)
        6
        >>> list(f)
        [3, 4, 5, 12, 11, 10]
        """
        self.operator = operator
        self.parts = list(parts)
        for loc in self.parts:
            if not isinstance(loc, FeatureLocation):
                raise ValueError("CompoundLocation should be given a list of "
                                 "FeatureLocation objects, not %s" % loc.__class__)
        # Count (and report) the list copy, not the original argument. The
        # argument may be an iterator with no len(), and if it were a tuple
        # the % operator would unpack it rather than show it as one value.
        if len(self.parts) < 2:
            raise ValueError(
                "CompoundLocation should have at least 2 parts, not %r"
                % (self.parts,))

    def __str__(self):
        """Return a representation of the CompoundLocation object (with python counting)."""
        return "%s{%s}" % (self.operator, ", ".join(str(loc) for loc in self.parts))

    def __repr__(self):
        """Represent the CompoundLocation object as string for debugging."""
        return "%s(%r, %r)" % (self.__class__.__name__,
                               self.parts, self.operator)

    def _get_strand(self):
        """Get function for the strand property (PRIVATE)."""
        # Historically a join on the reverse strand has been represented
        # in Biopython with both the parent SeqFeature and its children
        # (the exons for a CDS) all given a strand of -1. Likewise, for
        # a join feature on the forward strand they all have strand +1.
        # However, we must also consider evil mixed strand examples like
        # this, join(complement(69611..69724),139856..140087,140625..140650)
        if len(set(loc.strand for loc in self.parts)) == 1:
            return self.parts[0].strand
        else:
            return None  # i.e. mixed strands

    def _set_strand(self, value):
        """Set function for the strand property (PRIVATE)."""
        # Should this be allowed/encouraged?
        for loc in self.parts:
            loc.strand = value

    strand = property(fget=_get_strand, fset=_set_strand,
                      doc="""Overall strand of the compound location.

        If all the parts have the same strand, that is returned. Otherwise
        for mixed strands, this returns None.

        >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation
        >>> f1 = FeatureLocation(15, 17, strand=1)
        >>> f2 = FeatureLocation(20, 30, strand=-1)
        >>> f = f1 + f2
        >>> f1.strand
        1
        >>> f2.strand
        -1
        >>> f.strand
        >>> f.strand is None
        True

        If you set the strand of a CompoundLocation, this is applied to
        all the parts - use with caution:

        >>> f.strand = 1
        >>> f1.strand
        1
        >>> f2.strand
        1
        >>> f.strand
        1

        """)

    def __add__(self, other):
        """Combine locations, or shift the location by an integer offset.

        >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation
        >>> f1 = FeatureLocation(15, 17) + FeatureLocation(20, 30)
        >>> print(f1)
        join{[15:17], [20:30]}

        You can add another FeatureLocation:

        >>> print(f1 + FeatureLocation(40, 50))
        join{[15:17], [20:30], [40:50]}
        >>> print(FeatureLocation(5, 10) + f1)
        join{[5:10], [15:17], [20:30]}

        You can also add another CompoundLocation:

        >>> f2 = FeatureLocation(40, 50) + FeatureLocation(60, 70)
        >>> print(f2)
        join{[40:50], [60:70]}
        >>> print(f1 + f2)
        join{[15:17], [20:30], [40:50], [60:70]}

        Also, as with the FeatureLocation, adding an integer shifts the
        location's co-ordinates by that offset:

        >>> print(f1 + 100)
        join{[115:117], [120:130]}
        >>> print(200 + f1)
        join{[215:217], [220:230]}
        >>> print(f1 + (-5))
        join{[10:12], [15:25]}

        Adding two CompoundLocation objects with different operators raises
        a ValueError, and any other kind of operand raises a
        NotImplementedError.
        """
        if isinstance(other, FeatureLocation):
            return CompoundLocation(self.parts + [other], self.operator)
        elif isinstance(other, CompoundLocation):
            if self.operator != other.operator:
                # Handle join+order -> order as a special case?
                raise ValueError("Mixed operators %s and %s"
                                 % (self.operator, other.operator))
            return CompoundLocation(self.parts + other.parts, self.operator)
        elif _is_int_or_long(other):
            return self._shift(other)
        else:
            # NOTE(review): the usual protocol is to return NotImplemented,
            # but the exception is kept as callers may be catching it.
            raise NotImplementedError

    def __radd__(self, other):
        """Add a feature to the left.

        Handles a FeatureLocation (which becomes the first part) or an
        integer offset (which shifts the location). Any other kind of
        operand raises a NotImplementedError.
        """
        if isinstance(other, FeatureLocation):
            return CompoundLocation([other] + self.parts, self.operator)
        elif _is_int_or_long(other):
            return self._shift(other)
        else:
            # NOTE(review): see __add__, kept for backwards compatibility.
            raise NotImplementedError

    def __contains__(self, value):
        """Check if an integer position is within the CompoundLocation object."""
        return any(value in loc for loc in self.parts)

    def __nonzero__(self):
        """Return True regardless of the length of the feature.

        This behaviour is for backwards compatibility, since until the
        __len__ method was added, a FeatureLocation always evaluated as True.

        Note that in comparison, Seq objects, strings, lists, etc, will all
        evaluate to False if they have length zero.

        WARNING: The FeatureLocation may in future evaluate to False when its
        length is zero (in order to better match normal python behaviour)!
        """
        return True

    # Python 2 consults __nonzero__ but Python 3 consults __bool__. Without
    # this alias Python 3 falls back to __len__, so a zero length location
    # would evaluate as False - contradicting the documented behaviour.
    __bool__ = __nonzero__

    def __len__(self):
        """Return the length of the CompoundLocation object."""
        return sum(len(loc) for loc in self.parts)

    def __iter__(self):
        """Iterate over the parent positions within the CompoundLocation object."""
        for loc in self.parts:
            for pos in loc:
                yield pos

    def __eq__(self, other):
        """Check if all parts of CompoundLocation are equal to all parts of other CompoundLocation."""
        if not isinstance(other, CompoundLocation):
            return False
        if len(self.parts) != len(other.parts):
            return False
        if self.operator != other.operator:
            return False
        for self_part, other_part in zip(self.parts, other.parts):
            if self_part != other_part:
                return False
        return True

    def __ne__(self, other):
        """Implement the not-equal operand."""
        # This is needed for py2, but not for py3.
        return not self == other

    def _shift(self, offset):
        """Return a copy of the CompoundLocation shifted by an offset (PRIVATE)."""
        return CompoundLocation([loc._shift(offset) for loc in self.parts],
                                self.operator)

    def _flip(self, length):
        """Return a copy of the locations after the parent is reversed (PRIVATE).

        Note that the order of the parts is NOT reversed too. Consider a CDS
        on the forward strand with exons small, medium and large (in length).
        Once we change the frame of reference to the reverse complement strand,
        the start codon is still part of the small exon, and the stop codon
        still part of the large exon - so the part order remains the same!

        Here is an artificial example, were the features map to the two upper
        case regions and the lower case runs of n are not used:

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqFeature import FeatureLocation
        >>> dna = Seq("nnnnnAGCATCCTGCTGTACnnnnnnnnGAGAMTGCCATGCCCCTGGAGTGAnnnnn")
        >>> small = FeatureLocation(5, 20, strand=1)
        >>> large = FeatureLocation(28, 52, strand=1)
        >>> location = small + large
        >>> print(small)
        [5:20](+)
        >>> print(large)
        [28:52](+)
        >>> print(location)
        join{[5:20](+), [28:52](+)}
        >>> for part in location.parts:
        ...     print(len(part))
        ...
        15
        24

        As you can see, this is a silly example where each "exon" is a word:

        >>> print(small.extract(dna).translate())
        SILLY
        >>> print(large.extract(dna).translate())
        EXAMPLE*
        >>> print(location.extract(dna).translate())
        SILLYEXAMPLE*
        >>> for part in location.parts:
        ...     print(part.extract(dna).translate())
        ...
        SILLY
        EXAMPLE*

        Now, let's look at this from the reverse strand frame of reference:

        >>> flipped_dna = dna.reverse_complement()
        >>> flipped_location = location._flip(len(dna))
        >>> print(flipped_location.extract(flipped_dna).translate())
        SILLYEXAMPLE*
        >>> for part in flipped_location.parts:
        ...     print(part.extract(flipped_dna).translate())
        ...
        SILLY
        EXAMPLE*

        The key point here is the first part of the CompoundFeature is still the
        small exon, while the second part is still the large exon:

        >>> for part in flipped_location.parts:
        ...     print(len(part))
        ...
        15
        24
        >>> print(flipped_location)
        join{[37:52](-), [5:29](-)}

        Notice the parts are not reversed. However, there was a bug here in older
        versions of Biopython which would have given join{[5:29](-), [37:52](-)}
        and the translation would have wrongly been "EXAMPLE*SILLY" instead.

        """
        return CompoundLocation([loc._flip(length) for loc in self.parts],
                                self.operator)

    @property
    def start(self):
        """Start location - left most (minimum) value, regardless of strand.

        Read only, returns an integer like position object, possibly a fuzzy
        position.

        For the special case of a CompoundLocation wrapping the origin of a
        circular genome, this will return zero.
        """
        return min(loc.start for loc in self.parts)

    @property
    def end(self):
        """End location - right most (maximum) value, regardless of strand.

        Read only, returns an integer like position object, possibly a fuzzy
        position.

        For the special case of a CompoundLocation wrapping the origin of
        a circular genome this will match the genome length (minus one
        given how Python counts from zero).
        """
        return max(loc.end for loc in self.parts)

    @property
    def nofuzzy_start(self):
        """Start position (integer, approximated if fuzzy, read only) (OBSOLETE).

        This is an alias for int(feature.start), which should be used in
        preference -- unless you are trying to support old versions of
        Biopython.
        """
        try:
            return int(self.start)
        except TypeError:
            if isinstance(self.start, UnknownPosition):
                return None
            raise

    @property
    def nofuzzy_end(self):
        """End position (integer, approximated if fuzzy, read only) (OBSOLETE).

        This is an alias for int(feature.end), which should be used in
        preference -- unless you are trying to support old versions of
        Biopython.
        """
        try:
            return int(self.end)
        except TypeError:
            if isinstance(self.end, UnknownPosition):
                return None
            raise

    @property
    def ref(self):
        """Not present in CompoundLocation, dummy method for API compatibility."""
        return None

    @property
    def ref_db(self):
        """Not present in CompoundLocation, dummy method for API compatibility."""
        return None

    def extract(self, parent_sequence):
        """Extract the sequence from supplied parent sequence using the CompoundLocation object.

        The parent_sequence can be a Seq like object or a string, and will
        generally return an object of the same type. The exception to this is
        a MutableSeq as the parent sequence will return a Seq object.

        >>> from Bio.Seq import Seq
        >>> from Bio.Alphabet import generic_protein
        >>> from Bio.SeqFeature import FeatureLocation, CompoundLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
        >>> fl1 = FeatureLocation(2, 8)
        >>> fl2 = FeatureLocation(10, 15)
        >>> fl3 = CompoundLocation([fl1,fl2])
        >>> fl3.extract(seq)
        Seq('QHKAMILIVIC', ProteinAlphabet())

        """
        # This copes with mixed strand features & all on reverse:
        parts = [loc.extract(parent_sequence) for loc in self.parts]
        # We use addition rather than a join to avoid alphabet issues:
        f_seq = parts[0]
        for part in parts[1:]:
            f_seq += part
        return f_seq
1479
class AbstractPosition(object):
    """Abstract base class representing a position."""

    def __repr__(self):
        """Represent the AbstractPosition object as a string for debugging.

        The class name is followed by a literal ``(...)`` placeholder.
        """
        class_name = self.__class__.__name__
        return "%s(...)" % class_name
1487
class ExactPosition(int, AbstractPosition):
    """Specify the specific position of a boundary.

    Arguments:
     - position - The position of the boundary.
     - extension - An optional argument which must be zero since we don't
       have an extension. The argument is provided so that the same number
       of arguments can be passed to all position types.

    In this case, there is no fuzziness associated with the position.

    >>> p = ExactPosition(5)
    >>> p
    ExactPosition(5)
    >>> print(p)
    5

    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    Integer comparisons and operations should work as expected:

    >>> p == 5
    True
    >>> p < 6
    True
    >>> p <= 5
    True
    >>> p + 10
    15

    """

    def __new__(cls, position, extension=0):
        """Create an ExactPosition object.

        Raises AttributeError if the extension is not zero.
        """
        if extension != 0:
            raise AttributeError("Non-zero extension %s for exact position."
                                 % extension)
        return int.__new__(cls, position)

    def __repr__(self):
        """Represent the ExactPosition object as a string for debugging."""
        class_name = self.__class__.__name__
        return "%s(%i)" % (class_name, int(self))

    @property
    def position(self):
        """Legacy attribute to get position as integer (OBSOLETE)."""
        return int(self)

    @property
    def extension(self):
        """Not present in this object, return zero (OBSOLETE)."""
        return 0

    def _shift(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        # By default preserve any subclass
        shifted = int(self) + offset
        return self.__class__(shifted)

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE)."""
        # By default preserve any subclass
        flipped = length - int(self)
        return self.__class__(flipped)
1554
class UncertainPosition(ExactPosition):
    """Specify a specific position which is uncertain.

    This is used in UniProt, e.g. ?222 for uncertain position 222, or in the
    XML format explicitly marked as uncertain. Does not apply to GenBank/EMBL.

    No behaviour is added or changed compared to ExactPosition, the
    subclass only serves to mark the position as uncertain.
    """
1564
class UnknownPosition(AbstractPosition):
    """Specify a specific position which is unknown (has no position).

    This is used in UniProt, e.g. ? or in the XML as unknown.
    """

    def __repr__(self):
        """Represent the UnknownPosition object as a string for debugging."""
        class_name = self.__class__.__name__
        return "%s()" % class_name

    def __hash__(self):
        """Return the hash value of the UnknownPosition object.

        All instances share the hash of None.
        """
        return hash(None)

    @property
    def position(self):
        """Legacy attribute to get location (None) (OBSOLETE)."""
        return None

    @property
    def extension(self):  # noqa: D402
        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
        return 0

    def _shift(self, offset):
        """Return the position object itself, the offset is ignored (PRIVATE)."""
        return self

    def _flip(self, length):
        """Return the position object itself, the length is ignored (PRIVATE)."""
        return self
1597
class WithinPosition(int, AbstractPosition):
    """Specify the position of a boundary within some coordinates.

    Arguments:
     - position - The default integer position
     - left - The start (left) position of the boundary
     - right - The end (right) position of the boundary

    This allows dealing with a position like ((1.4)..100). This
    indicates that the start of the sequence is somewhere between 1
    and 4. Since this is a start coordinate, it should act like
    it is at position 1 (or in Python counting, 0).

    >>> p = WithinPosition(10, 10, 13)
    >>> p
    WithinPosition(10, left=10, right=13)
    >>> print(p)
    (10.13)
    >>> int(p)
    10

    Basic integer comparisons and operations should work as though
    this were a plain integer:

    >>> p == 10
    True
    >>> p in [9, 10, 11]
    True
    >>> p < 11
    True
    >>> p + 10
    20

    >>> isinstance(p, WithinPosition)
    True
    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    Note this also applies for comparison to other position objects,
    where again the integer behaviour is used:

    >>> p == 10
    True
    >>> p == ExactPosition(10)
    True
    >>> p == BeforePosition(10)
    True
    >>> p == AfterPosition(10)
    True

    If this were an end point, you would want the position to be 13:

    >>> p2 = WithinPosition(13, 10, 13)
    >>> p2
    WithinPosition(13, left=10, right=13)
    >>> print(p2)
    (10.13)
    >>> int(p2)
    13
    >>> p2 == 13
    True
    >>> p2 == ExactPosition(13)
    True

    The old legacy properties of position and extension give the
    starting/lower/left position as an integer, and the distance
    to the ending/higher/right position as an integer. Note that
    the position object will act like either the left or the right
    end-point depending on how it was created:

    >>> p.position == p2.position == 10
    True
    >>> p.extension == p2.extension == 3
    True
    >>> int(p) == int(p2)
    False
    >>> p == 10
    True
    >>> p2 == 13
    True

    """

    def __new__(cls, position, left, right):
        """Create a WithinPosition object.

        The position is asserted to equal either left or right.
        """
        assert position == left or position == right, \
            "WithinPosition: %r should match left %r or right %r" \
            % (position, left, right)
        within = int.__new__(cls, position)
        within._left = left
        within._right = right
        return within

    def __repr__(self):
        """Represent the WithinPosition object as a string for debugging."""
        fields = (self.__class__.__name__, int(self), self._left, self._right)
        return "%s(%i, left=%i, right=%i)" % fields

    def __str__(self):
        """Return a representation of the WithinPosition object (with python counting)."""
        return "(%s.%s)" % (self._left, self._right)

    @property
    def position(self):
        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
        return self._left

    @property
    def extension(self):  # noqa: D402
        """Legacy attribute to get extension (from left to right) as an integer (OBSOLETE)."""
        return self._right - self._left

    def _shift(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        new_position = int(self) + offset
        new_left = self._left + offset
        new_right = self._right + offset
        return self.__class__(new_position, new_left, new_right)

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE)."""
        new_position = length - int(self)
        # The old right end becomes the new left end, and vice versa
        new_left = length - self._right
        new_right = length - self._left
        return self.__class__(new_position, new_left, new_right)
1725
class BetweenPosition(int, AbstractPosition):
    """Specify the position of a boundary between two coordinates (OBSOLETE?).

    Arguments:
     - position - The default integer position
     - left - The start (left) position of the boundary
     - right - The end (right) position of the boundary

    This allows dealing with a position like 123^456. This
    indicates that the start of the sequence is somewhere between
    123 and 456. It is up to the parser to set the position argument
    to either boundary point (depending on if this is being used as
    a start or end of the feature). For example as a feature end:

    >>> p = BetweenPosition(456, 123, 456)
    >>> p
    BetweenPosition(456, left=123, right=456)
    >>> print(p)
    (123^456)
    >>> int(p)
    456

    Integer equality and comparison use the given position,

    >>> p == 456
    True
    >>> p in [455, 456, 457]
    True
    >>> p > 300
    True

    The old legacy properties of position and extension give the
    starting/lower/left position as an integer, and the distance
    to the ending/higher/right position as an integer. Note that
    the position object will act like either the left or the right
    end-point depending on how it was created:

    >>> p2 = BetweenPosition(123, left=123, right=456)
    >>> p.position == p2.position == 123
    True
    >>> p.extension
    333
    >>> p2.extension
    333
    >>> p.extension == p2.extension == 333
    True
    >>> int(p) == int(p2)
    False
    >>> p == 456
    True
    >>> p2 == 123
    True

    Note this potentially surprising behaviour:

    >>> BetweenPosition(123, left=123, right=456) == ExactPosition(123)
    True
    >>> BetweenPosition(123, left=123, right=456) == BeforePosition(123)
    True
    >>> BetweenPosition(123, left=123, right=456) == AfterPosition(123)
    True

    i.e. For equality (and sorting) the position objects behave like
    integers.
    """

    def __new__(cls, position, left, right):
        """Create a new instance in BetweenPosition object.

        The position is asserted to equal either left or right.
        """
        assert position == left or position == right
        between = int.__new__(cls, position)
        between._left = left
        between._right = right
        return between

    def __repr__(self):
        """Represent the BetweenPosition object as a string for debugging."""
        fields = (self.__class__.__name__, int(self), self._left, self._right)
        return "%s(%i, left=%i, right=%i)" % fields

    def __str__(self):
        """Return a representation of the BetweenPosition object (with python counting)."""
        return "(%s^%s)" % (self._left, self._right)

    @property
    def position(self):
        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
        return self._left

    @property
    def extension(self):  # noqa: D402
        """Legacy attribute to get extension (from left to right) as an integer (OBSOLETE)."""
        return self._right - self._left

    def _shift(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        new_position = int(self) + offset
        new_left = self._left + offset
        new_right = self._right + offset
        return self.__class__(new_position, new_left, new_right)

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE)."""
        new_position = length - int(self)
        # The old right end becomes the new left end, and vice versa
        new_left = length - self._right
        new_right = length - self._left
        return self.__class__(new_position, new_left, new_right)
1832
class BeforePosition(int, AbstractPosition):
    """Specify a position where the actual location occurs before it.

    Arguments:
     - position - The upper boundary of where the location can occur.
     - extension - An optional argument which must be zero since we don't
       have an extension. The argument is provided so that the same number
       of arguments can be passed to all position types.

    This is used to specify positions like (<10..100) where the location
    occurs somewhere before position 10.

    >>> p = BeforePosition(5)
    >>> p
    BeforePosition(5)
    >>> print(p)
    <5
    >>> int(p)
    5
    >>> p + 10
    15

    Note this potentially surprising behaviour:

    >>> p == ExactPosition(5)
    True
    >>> p == AfterPosition(5)
    True

    Just remember that for equality and sorting the position objects act
    like integers.
    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, extension=0):
        """Create a new instance in BeforePosition object.

        Raises AttributeError if the extension is not zero.
        """
        if extension != 0:
            raise AttributeError("Non-zero extension %s for exact position."
                                 % extension)
        return int.__new__(cls, position)

    @property
    def position(self):
        """Legacy attribute to get position as integer (OBSOLETE)."""
        return int(self)

    @property
    def extension(self):  # noqa: D402
        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
        return 0

    def __repr__(self):
        """Represent the location as a string for debugging."""
        class_name = self.__class__.__name__
        return "%s(%i)" % (class_name, int(self))

    def __str__(self):
        """Return a representation of the BeforePosition object (with python counting)."""
        return "<%s" % self.position

    def _shift(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        shifted = int(self) + offset
        return self.__class__(shifted)

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE).

        Note the result is an AfterPosition rather than a BeforePosition.
        """
        flipped = length - int(self)
        return AfterPosition(flipped)
1900
class AfterPosition(int, AbstractPosition):
    """Fuzzy position whose actual location lies after the given value.

    Arguments:
     - position - The lower boundary of where the location can occur.
     - extension - Optional, and must be zero because this position type
       has no extension. It is accepted only so that every position type
       can be created with the same number of arguments.

    This is used to specify positions like (>10..100) where the location
    occurs somewhere after position 10.

    >>> p = AfterPosition(7)
    >>> p
    AfterPosition(7)
    >>> print(p)
    >7
    >>> int(p)
    7
    >>> p + 10
    17

    >>> isinstance(p, AfterPosition)
    True
    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    Note this potentially surprising behaviour:

    >>> p == ExactPosition(7)
    True
    >>> p == BeforePosition(7)
    True

    Just remember that for equality and sorting the position objects act
    like integers.
    """

    # Subclasses int so can't use __init__
    def __new__(cls, position, extension=0):
        """Create a new AfterPosition instance.

        Raises AttributeError if a non-zero extension is supplied.
        """
        if extension != 0:
            message = "Non-zero extension %s for exact position." % extension
            raise AttributeError(message)
        return int.__new__(cls, position)

    @property
    def position(self):
        """Legacy attribute giving the position as a plain integer (OBSOLETE)."""
        return int(self)

    @property
    def extension(self):  # noqa: D402
        """Legacy attribute giving the extension, always zero (OBSOLETE)."""
        return 0

    def __repr__(self):
        """Return the debugging representation, e.g. ``AfterPosition(7)``."""
        class_name = self.__class__.__name__
        return "%s(%d)" % (class_name, int(self))

    def __str__(self):
        """Return the position prefixed with ``>`` (with python counting)."""
        return ">" + str(self.position)

    def _shift(self, offset):
        """Return a copy of this position moved by ``offset`` (PRIVATE)."""
        shifted = int(self) + offset
        return self.__class__(shifted)

    def _flip(self, length):
        """Return the position after the parent is reversed (PRIVATE).

        The mirrored coordinate is returned as a BeforePosition.
        """
        mirrored = length - int(self)
        return BeforePosition(mirrored)
1975
class OneOfPosition(int, AbstractPosition):
    """Specify a position where the location can be multiple positions.

    This models the GenBank 'one-of(1888,1901)' function, and tries
    to make this fit within the Biopython Position models. If this was
    a start position it should act like 1888, but as an end position 1901.

    >>> p = OneOfPosition(1888, [ExactPosition(1888), ExactPosition(1901)])
    >>> p
    OneOfPosition(1888, choices=[ExactPosition(1888), ExactPosition(1901)])
    >>> int(p)
    1888

    Integer comparisons and operators act like using int(p),

    >>> p == 1888
    True
    >>> p <= 1888
    True
    >>> p > 1888
    False
    >>> p + 100
    1988

    >>> isinstance(p, OneOfPosition)
    True
    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    The old legacy properties of position and extension give the
    starting/lowest/left-most position as an integer, and the
    distance to the ending/highest/right-most position as an integer.
    Note that the position object will act like one of the list of
    possible locations depending on how it was created:

    >>> p2 = OneOfPosition(1901, [ExactPosition(1888), ExactPosition(1901)])
    >>> p.position == p2.position == 1888
    True
    >>> p.extension == p2.extension == 13
    True
    >>> int(p) == int(p2)
    False
    >>> p == 1888
    True
    >>> p2 == 1901
    True

    """

    def __new__(cls, position, choices):
        """Initialize with a set of possible positions.

        choices is a list of AbstractPosition derived objects,
        specifying possible locations.

        position is an integer specifying the default behaviour; it must
        compare equal to one of the entries in choices.

        Raises AssertionError if position does not match any of choices.
        """
        # Raised explicitly instead of using an ``assert`` statement so the
        # validation is still performed when Python runs with -O.
        if position not in choices:
            raise AssertionError("OneOfPosition: %r should match one of %r"
                                 % (position, choices))
        obj = int.__new__(cls, position)
        # The list is stored as given (not copied).
        obj.position_choices = choices
        return obj

    @property
    def position(self):
        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
        return min(int(pos) for pos in self.position_choices)

    @property
    def extension(self):
        """Legacy attribute to get extension as integer (OBSOLETE)."""
        positions = [int(pos) for pos in self.position_choices]
        return max(positions) - min(positions)

    def __repr__(self):
        """Represent the OneOfPosition object as a string for debugging."""
        return "%s(%i, choices=%r)" % (self.__class__.__name__,
                                       int(self), self.position_choices)

    def __str__(self):
        """Return a representation of the OneOfPosition object (with python counting).

        The choices are rendered with str() and comma separated, for
        example ``one-of(1888,1901)``.
        """
        joined = ",".join(str(pos) for pos in self.position_choices)
        return "one-of(%s)" % joined

    def _shift(self, offset):
        """Return a copy of the position object with its location shifted (PRIVATE)."""
        return self.__class__(int(self) + offset,
                              [p._shift(offset) for p in self.position_choices])

    def _flip(self, length):
        """Return a copy of the location after the parent is reversed (PRIVATE).

        Each choice is flipped individually and the order of the choices
        is reversed.
        """
        return self.__class__(length - int(self),
                              [p._flip(length) for p in self.position_choices[::-1]])
2076
class PositionGap(object):
    """Simple class to hold information about a gap between positions."""

    def __init__(self, gap_size):
        """Initialize with a position object containing the gap information.

        The value is stored unchanged as the ``gap_size`` attribute.
        """
        self.gap_size = gap_size

    def __repr__(self):
        """Represent the position gap as a string for debugging."""
        return "%s(%r)" % (self.__class__.__name__, self.gap_size)

    def __str__(self):
        """Return a representation of the PositionGap object (with python counting)."""
        # Wrap the operand in a one-element tuple: with a bare operand the
        # ``%`` operator would unpack a tuple-valued gap_size as several
        # format arguments instead of formatting it as a single value.
        return "gap(%s)" % (self.gap_size,)
if __name__ == "__main__":
    # When the module is executed as a script, hand over to the
    # doctest runner helper from Bio._utils.
    from Bio import _utils

    _utils.run_doctest()