Package Bio :: Module SeqFeature
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqFeature

   1  # Copyright 2000-2003 Jeff Chang. 
   2  # Copyright 2001-2008 Brad Chapman. 
   3  # Copyright 2005-2012 by Peter Cock. 
   4  # Copyright 2006-2009 Michiel de Hoon. 
   5  # All rights reserved. 
   6  # This code is part of the Biopython distribution and governed by its 
   7  # license.  Please see the LICENSE file that should have been included 
   8  # as part of this package. 
   9  """Represent a Sequence Feature holding info about a part of a sequence. 
  10   
  11  This is heavily modeled after the Biocorba SeqFeature objects, and 
  12  may be pretty biased towards GenBank stuff since I'm writing it 
  13  for the GenBank parser output... 
  14   
  15  What's here: 
  16   
  17  Base class to hold a Feature. 
  18  ---------------------------- 
  19  classes: 
  20  o SeqFeature 
  21   
  22  Hold information about a Reference. 
  23  ---------------------------------- 
  24   
  25  This is an attempt to create a General class to hold Reference type 
  26  information. 
  27   
  28  classes: 
  29  o Reference 
  30   
  31  Specify locations of a feature on a Sequence. 
  32  --------------------------------------------- 
  33   
  34  This aims to handle, in Ewan's words, 'the dreaded fuzziness issue' in 
  35  much the same way as Biocorba. This has the advantages of allowing us 
  36  to handle fuzzy stuff in case anyone needs it, and also be compatible 
  37  with Biocorba. 
  38   
  39  classes: 
  40  o FeatureLocation - Specify the start and end location of a feature. 
  41   
  42  o ExactPosition - Specify the position as being exact. 
  43  o WithinPosition - Specify a position occurring within some range. 
  44  o BetweenPosition - Specify a position occurring between a range (OBSOLETE?). 
  45  o BeforePosition - Specify the position as being found before some base. 
  46  o AfterPosition - Specify the position as being found after some base. 
  47  o OneOfPosition - Specify a position where the location can be multiple positions. 
  48  o UnknownPosition - Represents missing information like '?' in UniProt. 
  49  """ 
  50   
  51  from Bio.Seq import MutableSeq, reverse_complement 
class SeqFeature(object):
    """Represent a Sequence Feature on an object.

    Attributes:
    o location - the location of the feature on the sequence (FeatureLocation)
    o type - the specified type of the feature (ie. CDS, exon, repeat...)
    o location_operator - a string specifying how this SeqFeature may
    be related to others. For example, in the example GenBank feature
    shown below, the location_operator would be "join"
    o strand - A value specifying on which strand (of a DNA sequence, for
    instance) the feature deals with. 1 indicates the plus strand, -1
    indicates the minus strand, 0 indicates stranded but unknown (? in GFF3),
    while the default of None indicates that strand doesn't apply (dot in GFF3,
    e.g. features on proteins). Note this is a shortcut for accessing the
    strand property of the feature's location.
    o id - A string identifier for the feature.
    o ref - A reference to another sequence. This could be an accession
    number for some different sequence. Note this is a shortcut for the
    reference property of the feature's location.
    o ref_db - A different database for the reference accession number.
    Note this is a shortcut for the reference property of the location
    o qualifiers - A dictionary of qualifiers on the feature. These are
    analogous to the qualifiers from a GenBank feature table. The keys of
    the dictionary are qualifier names, the values are the qualifier
    values.
    o sub_features - Additional SeqFeatures which fall under this 'parent'
    feature. For instance, if we have something like:

    CDS    join(1..10,30..40,50..60)

    Then the top level feature would be of type 'CDS' from 1 to 60 (actually 0
    to 60 in Python counting) with location_operator='join', and the three sub-
    features would also be of type 'CDS', and would be from 1 to 10, 30 to
    40 and 50 to 60, respectively (although actually using Python counting).

    To get the nucleotide sequence for this CDS, you would need to take the
    parent sequence and do seq[0:10]+seq[29:40]+seq[49:60] (Python counting).
    Things are more complicated with strands and fuzzy positions. To save you
    dealing with all these special cases, the SeqFeature provides an extract
    method to do this for you.

    The strand, ref and ref_db shortcuts read as None while the feature has
    no location; assigning anything other than None to them in that state
    raises ValueError.
    """

    def __init__(self, location=None, type='', location_operator='',
                 strand=None, id="<unknown id>",
                 qualifiers=None, sub_features=None,
                 ref=None, ref_db=None):
        """Initialize a SeqFeature on a Sequence.

        location can either be a FeatureLocation (with strand argument also
        given if required), or None.

        e.g. With no strand, on the forward strand, and on the reverse strand:

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> f1 = SeqFeature(FeatureLocation(5, 10), type="domain")
        >>> f1.strand == f1.location.strand == None
        True
        >>> f2 = SeqFeature(FeatureLocation(7, 110, strand=1), type="CDS")
        >>> f2.strand == f2.location.strand == +1
        True
        >>> f3 = SeqFeature(FeatureLocation(9, 108, strand=-1), type="CDS")
        >>> f3.strand == f3.location.strand == -1
        True

        An invalid strand will trigger an exception:

        >>> f4 = SeqFeature(FeatureLocation(50, 60), strand=2)
        Traceback (most recent call last):
           ...
        ValueError: Strand should be +1, -1, 0 or None, not 2

        Similarly if set via the FeatureLocation directly:

        >>> loc4 = FeatureLocation(50, 60, strand=2)
        Traceback (most recent call last):
           ...
        ValueError: Strand should be +1, -1, 0 or None, not 2

        For exact start/end positions, an integer can be used (as shown above)
        as shorthand for the ExactPosition object. For non-exact locations, the
        FeatureLocation must be specified via the appropriate position objects.

        Raises TypeError if location is neither None nor a FeatureLocation,
        and ValueError if strand, ref or ref_db is given without a location.
        """
        if location is not None and not isinstance(location, FeatureLocation):
            raise TypeError("FeatureLocation (or None) required for the location")
        self.location = location

        self.type = type
        self.location_operator = location_operator
        # The strand/ref/ref_db shortcuts write through to the location, so
        # they are only assigned when the caller actually supplied a value.
        if strand is not None:
            self.strand = strand
        self.id = id
        # Fresh containers per instance (never share a mutable default).
        if qualifiers is None:
            qualifiers = {}
        self.qualifiers = qualifiers
        if sub_features is None:
            sub_features = []
        self.sub_features = sub_features
        if ref is not None:
            self.ref = ref
        if ref_db is not None:
            self.ref_db = ref_db

    def _get_location_attribute(self, name):
        """Read an attribute of the location, or None if no location (PRIVATE)."""
        if self.location is None:
            return None
        return getattr(self.location, name)

    def _set_location_attribute(self, name, value):
        """Write an attribute through to the location (PRIVATE).

        Without a location, assigning None is a silent no-op and anything
        else raises ValueError.
        """
        if self.location is None:
            if value is not None:
                raise ValueError("Can't set %s without a location." % name)
            return
        setattr(self.location, name, value)

    def _get_strand(self):
        return self._get_location_attribute("strand")

    def _set_strand(self, value):
        self._set_location_attribute("strand", value)

    strand = property(fget=_get_strand, fset=_set_strand,
                      doc="""Feature's strand

                          This is a shortcut for feature.location.strand
                          """)

    def _get_ref(self):
        return self._get_location_attribute("ref")

    def _set_ref(self, value):
        self._set_location_attribute("ref", value)

    ref = property(fget=_get_ref, fset=_set_ref,
                   doc="""Feature location reference (e.g. accession).

                       This is a shortcut for feature.location.ref
                       """)

    def _get_ref_db(self):
        return self._get_location_attribute("ref_db")

    def _set_ref_db(self, value):
        self._set_location_attribute("ref_db", value)

    ref_db = property(fget=_get_ref_db, fset=_set_ref_db,
                      doc="""Feature location reference's database.

                          This is a shortcut for feature.location.ref_db
                          """)

    def __repr__(self):
        """A string representation of the record for debugging."""
        answer = "%s(%s" % (self.__class__.__name__, repr(self.location))
        if self.type:
            answer += ", type=%s" % repr(self.type)
        if self.location_operator:
            answer += ", location_operator=%s" % repr(self.location_operator)
        if self.id and self.id != "<unknown id>":
            answer += ", id=%s" % repr(self.id)
        if self.ref:
            answer += ", ref=%s" % repr(self.ref)
        if self.ref_db:
            answer += ", ref_db=%s" % repr(self.ref_db)
        answer += ")"
        return answer

    def __str__(self):
        """A readable summary of the feature intended to be printed to screen.
        """
        pieces = ["type: %s\n" % self.type,
                  "location: %s\n" % self.location]
        if self.id and self.id != "<unknown id>":
            pieces.append("id: %s\n" % self.id)
        pieces.append("qualifiers: \n")
        # Sorted so the output is stable regardless of dictionary ordering.
        for qual_key in sorted(self.qualifiers):
            pieces.append(" Key: %s, Value: %s\n"
                          % (qual_key, self.qualifiers[qual_key]))
        if len(self.sub_features) != 0:
            pieces.append("Sub-Features\n")
            for sub_feature in self.sub_features:
                pieces.append("%s\n" % sub_feature)
        return "".join(pieces)

    def _shift(self, offset):
        """Returns a copy of the feature with its location shifted (PRIVATE).

        The annotation qualifiers are copied (shallow copy of the mapping),
        and any sub-features are shifted recursively.
        """
        return SeqFeature(location=self.location._shift(offset),
                          type=self.type,
                          location_operator=self.location_operator,
                          id=self.id,
                          qualifiers=dict(self.qualifiers),
                          sub_features=[f._shift(offset)
                                        for f in self.sub_features])

    def _flip(self, length):
        """Returns a copy of the feature with its location flipped (PRIVATE).

        The argument length gives the length of the parent sequence. For
        example a location 0..20 (+1 strand) with parent length 30 becomes
        after flipping 10..30 (-1 strand). Strandless (None) or unknown
        strand (0) remain like that - just their end points are changed.

        The annotation qualifiers are copied (shallow copy of the mapping).
        Sub-features are flipped recursively and their order is reversed.
        """
        return SeqFeature(location=self.location._flip(length),
                          type=self.type,
                          location_operator=self.location_operator,
                          id=self.id,
                          qualifiers=dict(self.qualifiers),
                          sub_features=[f._flip(length)
                                        for f in self.sub_features[::-1]])

    def extract(self, parent_sequence):
        """Extract feature sequence from the supplied parent sequence.

        The parent_sequence can be a Seq like object or a string, and will
        generally return an object of the same type. The exception to this is
        a MutableSeq as the parent sequence will return a Seq object.

        This should cope with complex locations including complements, joins
        and fuzzy positions. Even mixed strand features should work! This
        also covers features on protein sequences (e.g. domains), although
        here reverse strand features are not permitted.

        >>> from Bio.Seq import Seq
        >>> from Bio.Alphabet import generic_protein
        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
        >>> f = SeqFeature(FeatureLocation(8,15), type="domain")
        >>> f.extract(seq)
        Seq('VALIVIC', ProteinAlphabet())

        Note - currently only sub-features of type "join" are supported;
        any other location_operator on a feature with sub-features raises
        ValueError.
        """
        if isinstance(parent_sequence, MutableSeq):
            #This avoids complications with reverse complements
            #(the MutableSeq reverse complement acts in situ)
            parent_sequence = parent_sequence.toseq()
        if not self.sub_features:
            return self.location.extract(parent_sequence)
        if self.location_operator != "join":
            raise ValueError(self.location_operator)
        if self.location.strand == -1:
            #This is a special case given how the GenBank parser works.
            #Must avoid doing the reverse complement twice.
            parts = []
            for f_sub in self.sub_features[::-1]:
                assert f_sub.location.strand == -1
                parts.append(f_sub.location.extract(parent_sequence))
        else:
            #This copes with mixed strand features:
            parts = [f_sub.location.extract(parent_sequence)
                     for f_sub in self.sub_features]
        #We use addition rather than a join to avoid alphabet issues:
        f_seq = parts[0]
        for part in parts[1:]:
            f_seq += part
        return f_seq

    def __nonzero__(self):
        """Returns True regardless of the length of the feature.

        This behaviour is for backwards compatibility, since until the
        __len__ method was added, a SeqFeature always evaluated as True.

        Note that in comparison, Seq objects, strings, lists, etc, will all
        evaluate to False if they have length zero.

        WARNING: The SeqFeature may in future evaluate to False when its
        length is zero (in order to better match normal python behaviour)!
        """
        return True

    # Python 3 consults __bool__ rather than __nonzero__; without this alias
    # truth testing would fall back to __len__ and contradict the docstring.
    __bool__ = __nonzero__

    def __len__(self):
        """Returns the length of the region described by a feature.

        >>> from Bio.Seq import Seq
        >>> from Bio.Alphabet import generic_protein
        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
        >>> f = SeqFeature(FeatureLocation(8,15), type="domain")
        >>> len(f)
        7
        >>> f.extract(seq)
        Seq('VALIVIC', ProteinAlphabet())
        >>> len(f.extract(seq))
        7

        For simple features without subfeatures this is the same as the region
        spanned (end position minus start position). However, for a feature
        defined by combining several subfeatures (e.g. a CDS as the join of
        several exons) the gaps are not counted (e.g. introns). This ensures
        that len(f) == len(f.extract(parent_seq)), and also makes sure things
        work properly with features wrapping the origin etc.
        """
        if self.sub_features:
            return sum(len(f) for f in self.sub_features)
        return len(self.location)

    def __iter__(self):
        """Iterate over the parent positions within the feature.

        The iteration order is strand aware, and can be thought of as moving
        along the feature using the parent sequence coordinates:

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> f = SeqFeature(FeatureLocation(5,10), type="domain", strand=-1)
        >>> len(f)
        5
        >>> for i in f: print i
        9
        8
        7
        6
        5
        >>> list(f)
        [9, 8, 7, 6, 5]
        """
        if not self.sub_features:
            for i in self.location:
                yield i
            return
        # On the reverse strand the sub-features are visited last-to-first;
        # each sub-feature's own location handles its internal direction.
        if self.strand == -1:
            ordered = self.sub_features[::-1]
        else:
            ordered = self.sub_features
        for f in ordered:
            for i in f.location:
                yield i

    def __contains__(self, value):
        """Check if an integer position is within the feature.

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> f = SeqFeature(FeatureLocation(5,10), type="domain", strand=-1)
        >>> len(f)
        5
        >>> [i for i in range(15) if i in f]
        [5, 6, 7, 8, 9]

        For example, to see which features include a SNP position, you could
        use this:

        >>> from Bio import SeqIO
        >>> record = SeqIO.read("GenBank/NC_000932.gb", "gb")
        >>> for f in record.features:
        ...     if 1750 in f:
        ...         print f.type, f.location
        source [0:154478](+)
        gene [1716:4347](-)
        tRNA [1716:4347](-)

        Note that for a feature defined as a join of several subfeatures (e.g.
        the union of several exons) the gaps are not checked (e.g. introns).
        In this example, the tRNA location is defined in the GenBank file as
        complement(join(1717..1751,4311..4347)), so that position 1760 falls
        in the gap:

        >>> for f in record.features:
        ...     if 1760 in f:
        ...         print f.type, f.location
        source [0:154478](+)
        gene [1716:4347](-)

        Note that additional care may be required with fuzzy locations, for
        example just before a BeforePosition:

        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> from Bio.SeqFeature import BeforePosition
        >>> f = SeqFeature(FeatureLocation(BeforePosition(3),8), type="domain")
        >>> len(f)
        5
        >>> [i for i in range(10) if i in f]
        [3, 4, 5, 6, 7]

        Raises ValueError if value is not an integer.
        """
        if not isinstance(value, int):
            raise ValueError("Currently we only support checking for integer "
                             "positions being within a SeqFeature.")
        if self.sub_features:
            # Only the sub-features count; gaps between them are excluded.
            for f in self.sub_features:
                if value in f:
                    return True
            return False
        return value in self.location
440
# --- References


# TODO -- Will this hold PubMed and Medline information decently?
class Reference(object):
    """Represent a Generic Reference object.

    Attributes:
    o location - A list of Location objects specifying regions of
    the sequence that the references correspond to. If no locations are
    specified, the entire sequence is assumed.
    o authors - A big old string, or a list split by author, of authors
    for the reference.
    o consrtm - Consortium string; empty by default and only included
    in the str() output when set.
    o title - The title of the reference.
    o journal - Journal the reference was published in.
    o medline_id - A medline reference for the article.
    o pubmed_id - A pubmed reference for the article.
    o comment - A place to stick any comments about the reference.
    """

    def __init__(self):
        """Create an empty reference; fill in the attributes afterwards."""
        self.location = []
        self.authors = ''
        self.consrtm = ''
        self.title = ''
        self.journal = ''
        self.medline_id = ''
        self.pubmed_id = ''
        self.comment = ''

    def __str__(self):
        """Output an informative string for debugging.

        One "label: value" line per field, preceded by one line per location.
        """
        rows = ["location: %s\n" % entry for entry in self.location]
        rows.append("authors: %s\n" % self.authors)
        if self.consrtm:
            rows.append("consrtm: %s\n" % self.consrtm)
        rows.extend([
            "title: %s\n" % self.title,
            "journal: %s\n" % self.journal,
            "medline id: %s\n" % self.medline_id,
            "pubmed id: %s\n" % self.pubmed_id,
            "comment: %s\n" % self.comment,
        ])
        return "".join(rows)

    def __repr__(self):
        """Short debugging representation showing only the title."""
        #TODO - Update this if __init__ later accepts values
        class_name = self.__class__.__name__
        return "%s(title=%s, ...)" % (class_name, repr(self.title))
490
# --- Handling feature locations

class FeatureLocation(object):
    """Specify the location of a feature along a sequence.

    The FeatureLocation is used for simple continuous features, which can
    be described as running from a start position to an end position
    (optionally with a strand and reference information). More complex
    locations made up from several non-continuous parts (e.g. a coding
    sequence made up of several exons) are currently described using a
    SeqFeature with sub-features.

    Note that the start and end location numbering follow Python's scheme,
    thus a GenBank entry of 123..150 (one based counting) becomes a location
    of [122:150] (zero based counting).

    >>> from Bio.SeqFeature import FeatureLocation
    >>> f = FeatureLocation(122, 150)
    >>> print f
    [122:150]
    >>> print f.start
    122
    >>> print f.end
    150
    >>> print f.strand
    None

    Note the strand defaults to None. If you are working with nucleotide
    sequences you'd want to be explicit if it is the forward strand:

    >>> from Bio.SeqFeature import FeatureLocation
    >>> f = FeatureLocation(122, 150, strand=+1)
    >>> print f
    [122:150](+)
    >>> print f.strand
    1

    Note that for a parent sequence of length n, the FeatureLocation
    start and end must satisfy the inequality 0 <= start <= end <= n.
    This means even for features on the reverse strand of a nucleotide
    sequence, we expect the 'start' coordinate to be less than the
    'end' coordinate.

    >>> from Bio.SeqFeature import FeatureLocation
    >>> r = FeatureLocation(122, 150, strand=-1)
    >>> print r
    [122:150](-)
    >>> print r.start
    122
    >>> print r.end
    150
    >>> print r.strand
    -1

    i.e. Rather than thinking of the 'start' and 'end' biologically in a
    strand aware manner, think of them as the 'left most' or 'minimum'
    boundary, and the 'right most' or 'maximum' boundary of the region
    being described.

    In the example above we have used standard exact positions, but there
    are also specialised position objects used to represent fuzzy positions
    as well, for example a GenBank location like complement(<123..150)
    would use a BeforePosition object for the start.
    """

    def __init__(self, start, end, strand=None, ref=None, ref_db=None):
        """Specify the start, end, strand etc of a sequence feature.

        start and end arguments specify the values where the feature begins
        and ends. These can either be any of the *Position objects that
        inherit from AbstractPosition, or can just be integers specifying the
        position. In the case of integers, the values are assumed to be
        exact and are converted in ExactPosition arguments. This is meant
        to make it easy to deal with non-fuzzy ends.

        i.e. Short form:

        >>> from Bio.SeqFeature import FeatureLocation
        >>> loc = FeatureLocation(5, 10, strand=-1)
        >>> print loc
        [5:10](-)

        Explicit form:

        >>> from Bio.SeqFeature import FeatureLocation, ExactPosition
        >>> loc = FeatureLocation(ExactPosition(5), ExactPosition(10), strand=-1)
        >>> print loc
        [5:10](-)

        Other fuzzy positions are used similarly,

        >>> from Bio.SeqFeature import FeatureLocation
        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
        >>> loc2 = FeatureLocation(BeforePosition(5), AfterPosition(10), strand=-1)
        >>> print loc2
        [<5:>10](-)

        For nucleotide features you will also want to specify the strand,
        use 1 for the forward (plus) strand, -1 for the reverse (negative)
        strand, 0 for stranded but strand unknown (? in GFF3), or None for
        when the strand does not apply (dot in GFF3), e.g. features on
        proteins.

        >>> loc = FeatureLocation(5, 10, strand=+1)
        >>> print loc
        [5:10](+)
        >>> print loc.strand
        1

        Normally feature locations are given relative to the parent
        sequence you are working with, but an explicit accession can
        be given with the optional ref and ref_db strings:

        >>> loc = FeatureLocation(105172, 108462, ref="AL391218.9", strand=1)
        >>> print loc
        AL391218.9[105172:108462](+)
        >>> print loc.ref
        AL391218.9

        Raises TypeError if start or end is neither a position object nor
        an integer, and ValueError for an invalid strand.
        """
        #TODO - Check 0 <= start <= end (<= length of reference)
        self._start = self._as_position(start, "start")
        self._end = self._as_position(end, "end")
        self.strand = strand
        self.ref = ref
        self.ref_db = ref_db

    @staticmethod
    def _as_position(value, label):
        """Coerce an integer or position object to a position (PRIVATE).

        label ("start" or "end") is only used in the TypeError message.
        """
        if isinstance(value, AbstractPosition):
            return value
        # Kept as two short-circuited checks so that ``long`` is only looked
        # up when the value is not already a plain int.
        if isinstance(value, int) or isinstance(value, long):
            return ExactPosition(value)
        raise TypeError("%s=%r %s" % (label, value, type(value)))

    def _get_strand(self):
        return self._strand

    def _set_strand(self, value):
        if value not in [+1, -1, 0, None]:
            raise ValueError("Strand should be +1, -1, 0 or None, not %r"
                             % value)
        self._strand = value

    strand = property(fget=_get_strand, fset=_set_strand,
                      doc="Strand of the location (+1, -1, 0 or None).")

    def __str__(self):
        """Returns a representation of the location (with python counting).

        For the simple case this uses the python slicing syntax, [122:150]
        (zero based counting) which GenBank would call 123..150 (one based
        counting).
        """
        answer = "[%s:%s]" % (self._start, self._end)
        if self.ref and self.ref_db:
            answer = "%s:%s%s" % (self.ref_db, self.ref, answer)
        elif self.ref:
            answer = self.ref + answer
        #Is ref_db without ref meaningful?
        if self.strand is None:
            return answer
        elif self.strand == +1:
            return answer + "(+)"
        elif self.strand == -1:
            return answer + "(-)"
        else:
            #strand = 0, stranded but strand unknown, ? in GFF3
            return answer + "(?)"

    def __repr__(self):
        """A string representation of the location for debugging."""
        optional = ""
        if self.strand is not None:
            optional += ", strand=%r" % self.strand
        if self.ref is not None:
            optional += ", ref=%r" % self.ref
        if self.ref_db is not None:
            optional += ", ref_db=%r" % self.ref_db
        return "%s(%r, %r%s)" % (self.__class__.__name__,
                                 self.start, self.end, optional)

    def __nonzero__(self):
        """Returns True regardless of the length of the feature.

        This behaviour is for backwards compatibility, since until the
        __len__ method was added, a FeatureLocation always evaluated as True.

        Note that in comparison, Seq objects, strings, lists, etc, will all
        evaluate to False if they have length zero.

        WARNING: The FeatureLocation may in future evaluate to False when its
        length is zero (in order to better match normal python behaviour)!
        """
        return True

    # Python 3 consults __bool__ rather than __nonzero__; without this alias
    # a zero-length location would be falsy there via __len__.
    __bool__ = __nonzero__

    def __len__(self):
        """Returns the length of the region described by the FeatureLocation.

        Note that extra care may be needed for fuzzy locations, e.g.

        >>> from Bio.SeqFeature import FeatureLocation
        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
        >>> loc = FeatureLocation(BeforePosition(5),AfterPosition(10))
        >>> len(loc)
        5
        """
        return int(self._end) - int(self._start)

    def __contains__(self, value):
        """Check if an integer position is within the FeatureLocation.

        Note that extra care may be needed for fuzzy locations, e.g.

        >>> from Bio.SeqFeature import FeatureLocation
        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
        >>> loc = FeatureLocation(BeforePosition(5),AfterPosition(10))
        >>> len(loc)
        5
        >>> [i for i in range(15) if i in loc]
        [5, 6, 7, 8, 9]

        Raises ValueError if value is not an integer.
        """
        if not isinstance(value, int):
            raise ValueError("Currently we only support checking for integer "
                             "positions being within a FeatureLocation.")
        # Half-open interval: start is included, end is excluded.
        return not (value < self._start or value >= self._end)

    def __iter__(self):
        """Iterate over the parent positions within the FeatureLocation.

        >>> from Bio.SeqFeature import FeatureLocation
        >>> from Bio.SeqFeature import BeforePosition, AfterPosition
        >>> loc = FeatureLocation(BeforePosition(5),AfterPosition(10))
        >>> len(loc)
        5
        >>> for i in loc: print i
        5
        6
        7
        8
        9
        >>> list(loc)
        [5, 6, 7, 8, 9]
        >>> [i for i in range(15) if i in loc]
        [5, 6, 7, 8, 9]

        Note this is strand aware:

        >>> loc = FeatureLocation(BeforePosition(5), AfterPosition(10), strand = -1)
        >>> list(loc)
        [9, 8, 7, 6, 5]
        """
        if self.strand == -1:
            for i in range(self._end - 1, self._start - 1, -1):
                yield i
        else:
            for i in range(self._start, self._end):
                yield i

    def _shift(self, offset):
        """Returns a copy of the location shifted by the offset (PRIVATE).

        Raises ValueError if the location refers to another sequence.
        """
        if self.ref or self.ref_db:
            #TODO - Return self?
            raise ValueError("Feature references another sequence.")
        return FeatureLocation(start=self._start._shift(offset),
                               end=self._end._shift(offset),
                               strand=self.strand)

    def _flip(self, length):
        """Returns a copy of the location after the parent is reversed (PRIVATE).

        Raises ValueError if the location refers to another sequence.
        """
        if self.ref or self.ref_db:
            #TODO - Return self?
            raise ValueError("Feature references another sequence.")
        #Note this will flip the start and end too!
        if self.strand == +1:
            flip_strand = -1
        elif self.strand == -1:
            flip_strand = +1
        else:
            #0 or None
            flip_strand = self.strand
        return FeatureLocation(start=self._end._flip(length),
                               end=self._start._flip(length),
                               strand=flip_strand)

    @property
    def start(self):
        """Start location (integer like, possibly a fuzzy position, read only)."""
        return self._start

    @property
    def end(self):
        """End location (integer like, possibly a fuzzy position, read only)."""
        return self._end

    @property
    def nofuzzy_start(self):
        """Start position (integer, approximated if fuzzy, read only) (OBSOLETE).

        This is now an alias for int(feature.start), which should be
        used in preference -- unless you are trying to support old
        versions of Biopython.
        """
        return int(self._start)

    @property
    def nofuzzy_end(self):
        """End position (integer, approximated if fuzzy, read only) (OBSOLETE).

        This is now an alias for int(feature.end), which should be
        used in preference -- unless you are trying to support old
        versions of Biopython.
        """
        return int(self._end)

    def extract(self, parent_sequence):
        """Extract feature sequence from the supplied parent sequence.

        Slices the parent between the (de-fuzzed) start and end, and reverse
        complements the result for a -1 strand location. Raises ValueError
        if the location refers to another sequence.
        """
        if self.ref or self.ref_db:
            #TODO - Take a dictionary as an optional argument?
            raise ValueError("Feature references another sequence.")
        if isinstance(parent_sequence, MutableSeq):
            #This avoids complications with reverse complements
            #(the MutableSeq reverse complement acts in situ)
            parent_sequence = parent_sequence.toseq()
        f_seq = parent_sequence[self.nofuzzy_start:self.nofuzzy_end]
        if self.strand == -1:
            try:
                f_seq = f_seq.reverse_complement()
            except AttributeError:
                # Plain strings have no method; use the module-level function.
                assert isinstance(f_seq, str)
                f_seq = reverse_complement(f_seq)
        return f_seq
828
class AbstractPosition(object):
    """Common base class for all of the position classes.
    """

    def __repr__(self):
        """Debugging representation: the class name followed by '(...)'."""
        class_name = self.__class__.__name__
        return class_name + "(...)"
837
class ExactPosition(int, AbstractPosition):
    """Specify the specific position of a boundary.

    o position - The position of the boundary.
    o extension - An optional argument which must be zero since we don't
    have an extension. The argument is provided so that the same number of
    arguments can be passed to all position types.

    In this case, there is no fuzziness associated with the position.

    >>> p = ExactPosition(5)
    >>> p
    ExactPosition(5)
    >>> print p
    5

    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    Integer comparisons and operations should work as expected:

    >>> p == 5
    True
    >>> p < 6
    True
    >>> p <= 5
    True
    >>> p + 10
    15

    """

    def __new__(cls, position, extension=0):
        """Build the integer value; a non-zero extension is rejected."""
        if extension == 0:
            return int.__new__(cls, position)
        raise AttributeError("Non-zero extension %s for exact position."
                             % extension)

    def __repr__(self):
        """String representation of the ExactPosition location for debugging."""
        class_name = self.__class__.__name__
        return "%s(%i)" % (class_name, int(self))

    @property
    def position(self):
        """Legacy attribute to get position as integer (OBSOLETE)."""
        return int(self)

    @property
    def extension(self):
        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
        return 0

    def _shift(self, offset):
        """Return a new position moved by offset (PRIVATE)."""
        #Rebuilt through the instance's own class so subclasses are preserved
        moved = int(self) + offset
        return self.__class__(moved)

    def _flip(self, length):
        """Return a new position mirrored within a parent of that length (PRIVATE)."""
        #Rebuilt through the instance's own class so subclasses are preserved
        mirrored = length - int(self)
        return self.__class__(mirrored)
899
class UncertainPosition(ExactPosition):
    """Specify a specific position which is uncertain.

    This is used in UniProt, e.g. ?222 for uncertain position 222, or in the
    XML format explicitly marked as uncertain. Does not apply to GenBank/EMBL.

    No behaviour is added here; everything is inherited from ExactPosition.
    """
908
class UnknownPosition(AbstractPosition):
    """Specify a specific position which is unknown (has no position).

    This is used in UniProt, e.g. ? or in the XML as unknown.
    """

    def __repr__(self):
        """String representation of the UnknownPosition location for debugging."""
        class_name = self.__class__.__name__
        return class_name + "()"

    def __hash__(self):
        """Every unknown position hashes the same as None."""
        return hash(None)

    @property
    def position(self):
        """Legacy attribute to get position (None) (OBSOLETE)."""
        return None

    @property
    def extension(self):
        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
        return 0

    def _shift(self, offset):
        """Shifting leaves an unknown position unchanged; returns self (PRIVATE)."""
        return self

    def _flip(self, length):
        """Flipping leaves an unknown position unchanged; returns self (PRIVATE)."""
        return self
938
class WithinPosition(int, AbstractPosition):
    """Specify the position of a boundary within some coordinates.

    Arguments:
    o position - The default integer position
    o left - The start (left) position of the boundary
    o right - The end (right) position of the boundary

    This allows dealing with a position like ((1.4)..100). This
    indicates that the start of the sequence is somewhere between 1
    and 4. Since this is a start coordinate, it should act like
    it is at position 1 (or in Python counting, 0).

    >>> p = WithinPosition(10,10,13)
    >>> p
    WithinPosition(10, left=10, right=13)
    >>> print p
    (10.13)
    >>> int(p)
    10

    Basic integer comparisons and operations should work as though
    this were a plain integer:

    >>> p == 10
    True
    >>> p in [9,10,11]
    True
    >>> p < 11
    True
    >>> p + 10
    20

    >>> isinstance(p, WithinPosition)
    True
    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    Note this also applies for comparison to other position objects,
    where again the integer behaviour is used:

    >>> p == 10
    True
    >>> p == ExactPosition(10)
    True
    >>> p == BeforePosition(10)
    True
    >>> p == AfterPosition(10)
    True

    If this were an end point, you would want the position to be 13:

    >>> p2 = WithinPosition(13,10,13)
    >>> p2
    WithinPosition(13, left=10, right=13)
    >>> print p2
    (10.13)
    >>> int(p2)
    13
    >>> p2 == 13
    True
    >>> p2 == ExactPosition(13)
    True

    The old legacy properties of position and extension give the
    starting/lower/left position as an integer, and the distance
    to the ending/higher/right position as an integer. Note that
    the position object will act like either the left or the right
    end-point depending on how it was created:

    >>> p.position == p2.position == 10
    True
    >>> p.extension == p2.extension == 3
    True
    >>> int(p) == int(p2)
    False
    >>> p == 10
    True
    >>> p2 == 13
    True

    """
    #Subclasses int so can't use __init__
    def __new__(cls, position, left, right):
        """Create the position, remembering both boundary coordinates.

        The integer value of the object is position, which must equal
        either left or right (checked with an assertion).
        """
        # The "%" operator is required here: without it the message string
        # is *called* with the tuple, so a failed check would raise
        # TypeError rather than the intended AssertionError.
        assert position == left or position == right, \
            "WithinPosition: %r should match left %r or right %r" \
            % (position, left, right)
        obj = int.__new__(cls, position)
        obj._left = left
        obj._right = right
        return obj

    def __repr__(self):
        """String representation of the WithinPosition location for debugging."""
        return "%s(%i, left=%i, right=%i)" \
               % (self.__class__.__name__, int(self),
                  self._left, self._right)

    def __str__(self):
        """Return the range in the form (left.right)."""
        return "(%s.%s)" % (self._left, self._right)

    @property
    def position(self):
        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
        return self._left

    @property
    def extension(self):
        """Legacy attribute to get extension (from left to right) as an integer (OBSOLETE)."""
        return self._right - self._left

    def _shift(self, offset):
        """Return a new object with value, left and right all moved by offset."""
        return self.__class__(int(self) + offset,
                              self._left + offset,
                              self._right + offset)

    def _flip(self, length):
        """Return a new object mirrored about length.

        The old right boundary becomes the new left one and vice versa,
        so that left and right keep their relative order.
        """
        return self.__class__(length - int(self),
                              length - self._right,
                              length - self._left)
1061
class BetweenPosition(int, AbstractPosition):
    """Specify the position of a boundary between two coordinates (OBSOLETE?).

    Arguments:
    o position - The default integer position
    o left - The start (left) position of the boundary
    o right - The end (right) position of the boundary

    This allows dealing with a position like 123^456. This
    indicates that the start of the sequence is somewhere between
    123 and 456. It is up to the parser to set the position argument
    to either boundary point (depending on if this is being used as
    a start or end of the feature). For example as a feature end:

    >>> p = BetweenPosition(456, 123, 456)
    >>> p
    BetweenPosition(456, left=123, right=456)
    >>> print p
    (123^456)
    >>> int(p)
    456

    Integer equality and comparison use the given position,

    >>> p == 456
    True
    >>> p in [455, 456, 457]
    True
    >>> p > 300
    True

    The old legacy properties of position and extension give the
    starting/lower/left position as an integer, and the distance
    to the ending/higher/right position as an integer. Note that
    the position object will act like either the left or the right
    end-point depending on how it was created:

    >>> p2 = BetweenPosition(123, left=123, right=456)
    >>> p.position == p2.position == 123
    True
    >>> p.extension
    333
    >>> p2.extension
    333
    >>> p.extension == p2.extension == 333
    True
    >>> int(p) == int(p2)
    False
    >>> p == 456
    True
    >>> p2 == 123
    True

    Note this potentially surprising behaviour:

    >>> BetweenPosition(123, left=123, right=456) == ExactPosition(123)
    True
    >>> BetweenPosition(123, left=123, right=456) == BeforePosition(123)
    True
    >>> BetweenPosition(123, left=123, right=456) == AfterPosition(123)
    True

    i.e. For equality (and sorting) the position objects behave like
    integers.
    """
    #Subclasses int so can't use __init__
    def __new__(cls, position, left, right):
        """Create the position, remembering both boundary coordinates.

        The integer value of the object is position, which must equal
        either left or right (checked with an assertion).
        """
        assert position == left or position == right, \
            "BetweenPosition: %r should match left %r or right %r" \
            % (position, left, right)
        obj = int.__new__(cls, position)
        obj._left = left
        obj._right = right
        return obj

    def __repr__(self):
        """String representation of the BetweenPosition location for debugging."""
        return "%s(%i, left=%i, right=%i)" \
               % (self.__class__.__name__, int(self),
                  self._left, self._right)

    def __str__(self):
        """Return the range in the form (left^right)."""
        return "(%s^%s)" % (self._left, self._right)

    @property
    def position(self):
        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
        return self._left

    @property
    def extension(self):
        """Legacy attribute to get extension (from left to right) as an integer (OBSOLETE)."""
        return self._right - self._left

    def _shift(self, offset):
        """Return a new object with value, left and right all moved by offset."""
        return self.__class__(int(self) + offset,
                              self._left + offset,
                              self._right + offset)

    def _flip(self, length):
        """Return a new object mirrored about length.

        The old right boundary becomes the new left one and vice versa,
        so that left and right keep their relative order.
        """
        return self.__class__(length - int(self),
                              length - self._right,
                              length - self._left)
1163
class BeforePosition(int, AbstractPosition):
    """Specify a position where the actual location occurs before it.

    Arguments:
    o position - The upper boundary of where the location can occur.
    o extension - An optional argument which must be zero since we don't
    have an extension. The argument is provided so that the same number of
    arguments can be passed to all position types.

    This is used to specify positions like (<10..100) where the location
    occurs somewhere before position 10.

    >>> p = BeforePosition(5)
    >>> p
    BeforePosition(5)
    >>> print p
    <5
    >>> int(p)
    5
    >>> p + 10
    15

    Note this potentially surprising behaviour:

    >>> p == ExactPosition(5)
    True
    >>> p == AfterPosition(5)
    True

    Just remember that for equality and sorting the position objects act
    like integers.
    """
    #Subclasses int so can't use __init__
    def __new__(cls, position, extension=0):
        """Create the position; raises AttributeError for a non-zero extension."""
        if extension != 0:
            # Name the class actually being built; the message used to be a
            # copy of the ExactPosition one and said "exact position".
            raise AttributeError("Non-zero extension %s for %s."
                                 % (extension, cls.__name__))
        return int.__new__(cls, position)

    @property
    def position(self):
        """Legacy attribute to get position as integer (OBSOLETE)."""
        return int(self)

    @property
    def extension(self):
        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
        return 0

    def __repr__(self):
        """A string representation of the location for debugging."""
        return "%s(%i)" % (self.__class__.__name__, int(self))

    def __str__(self):
        """Return the position prefixed with "<"."""
        return "<%s" % self.position

    def _shift(self, offset):
        """Return a new object of the same class moved by offset."""
        return self.__class__(int(self) + offset)

    def _flip(self, length):
        """Return an AfterPosition at (length - this position)."""
        return AfterPosition(length - int(self))
1226
class AfterPosition(int, AbstractPosition):
    """Specify a position where the actual location is found after it.

    Arguments:
    o position - The lower boundary of where the location can occur.
    o extension - An optional argument which must be zero since we don't
    have an extension. The argument is provided so that the same number of
    arguments can be passed to all position types.

    This is used to specify positions like (>10..100) where the location
    occurs somewhere after position 10.

    >>> p = AfterPosition(7)
    >>> p
    AfterPosition(7)
    >>> print p
    >7
    >>> int(p)
    7
    >>> p + 10
    17

    >>> isinstance(p, AfterPosition)
    True
    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    Note this potentially surprising behaviour:

    >>> p == ExactPosition(7)
    True
    >>> p == BeforePosition(7)
    True

    Just remember that for equality and sorting the position objects act
    like integers.
    """
    #Subclasses int so can't use __init__
    def __new__(cls, position, extension=0):
        """Create the position; raises AttributeError for a non-zero extension."""
        if extension != 0:
            # Name the class actually being built; the message used to be a
            # copy of the ExactPosition one and said "exact position".
            raise AttributeError("Non-zero extension %s for %s."
                                 % (extension, cls.__name__))
        return int.__new__(cls, position)

    @property
    def position(self):
        """Legacy attribute to get position as integer (OBSOLETE)."""
        return int(self)

    @property
    def extension(self):
        """Legacy attribute to get extension (zero) as integer (OBSOLETE)."""
        return 0

    def __repr__(self):
        """A string representation of the location for debugging."""
        return "%s(%i)" % (self.__class__.__name__, int(self))

    def __str__(self):
        """Return the position prefixed with ">"."""
        return ">%s" % self.position

    def _shift(self, offset):
        """Return a new object of the same class moved by offset."""
        return self.__class__(int(self) + offset)

    def _flip(self, length):
        """Return a BeforePosition at (length - this position)."""
        return BeforePosition(length - int(self))
1296
class OneOfPosition(int, AbstractPosition):
    """Specify a position where the location can be multiple positions.

    This models the GenBank 'one-of(1888,1901)' function, and tries
    to make this fit within the Biopython Position models. If this was
    a start position it should act like 1888, but as an end position 1901.

    >>> p = OneOfPosition(1888, [ExactPosition(1888), ExactPosition(1901)])
    >>> p
    OneOfPosition(1888, choices=[ExactPosition(1888), ExactPosition(1901)])
    >>> int(p)
    1888

    Integer comparisons and operators act like using int(p),

    >>> p == 1888
    True
    >>> p <= 1888
    True
    >>> p > 1888
    False
    >>> p + 100
    1988

    >>> isinstance(p, OneOfPosition)
    True
    >>> isinstance(p, AbstractPosition)
    True
    >>> isinstance(p, int)
    True

    The old legacy properties of position and extension give the
    starting/lowest/left-most position as an integer, and the
    distance to the ending/highest/right-most position as an integer.
    Note that the position object will act like one of the list of
    possible locations depending on how it was created:

    >>> p2 = OneOfPosition(1901, [ExactPosition(1888), ExactPosition(1901)])
    >>> p.position == p2.position == 1888
    True
    >>> p.extension == p2.extension == 13
    True
    >>> int(p) == int(p2)
    False
    >>> p == 1888
    True
    >>> p2 == 1901
    True

    """
    #Subclasses int so can't use __init__
    def __new__(cls, position, choices):
        """Initialize with a set of possible positions.

        choices is a list of AbstractPosition derived objects,
        specifying possible locations.

        position is an integer specifying the default behaviour; it
        must be one of the choices (checked with an assertion).
        """
        assert position in choices, \
            "OneOfPosition: %r should match one of %r" % (position, choices)
        obj = int.__new__(cls, position)
        obj.position_choices = choices
        return obj

    @property
    def position(self):
        """Legacy attribute to get (left) position as integer (OBSOLETE)."""
        return min(int(pos) for pos in self.position_choices)

    @property
    def extension(self):
        """Legacy attribute to get extension as integer (OBSOLETE)."""
        positions = [int(pos) for pos in self.position_choices]
        return max(positions) - min(positions)

    def __repr__(self):
        """String representation of the OneOfPosition location for debugging."""
        return "%s(%i, choices=%r)" % (self.__class__.__name__,
                                       int(self), self.position_choices)

    def __str__(self):
        """Return the choices in the form one-of(choice1,choice2,...)."""
        # Joining keeps the opening parenthesis intact even when there are
        # no choices, unlike appending "x," and then trimming one character.
        return "one-of(%s)" % ",".join(str(pos)
                                       for pos in self.position_choices)

    def _shift(self, offset):
        """Return a new object with the value and every choice moved by offset."""
        return self.__class__(int(self) + offset,
                              [p._shift(offset) for p in self.position_choices])

    def _flip(self, length):
        """Return a new object mirrored about length.

        Each choice is flipped and the list order is reversed.
        """
        return self.__class__(length - int(self),
                              [p._flip(length) for p in self.position_choices[::-1]])
1393
class PositionGap(object):
    """Simple class to hold information about a gap between positions."""

    def __init__(self, gap_size):
        """Initialize with a position object containing the gap information.

        The argument is stored unchanged as the gap_size attribute.
        """
        self.gap_size = gap_size

    def __repr__(self):
        """A string representation of the position gap for debugging."""
        class_name = self.__class__.__name__
        return "%s(%s)" % (class_name, repr(self.gap_size))

    def __str__(self):
        """Return the gap in the form gap(gap_size)."""
        return "gap(%s)" % self.gap_size
if __name__ == "__main__":
    # When executed as a script, call Bio's run_doctest helper.
    from Bio._utils import run_doctest
    run_doctest()