Package Bio :: Module SeqRecord
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqRecord

   1  # Copyright 2000-2002 Andrew Dalke. 
   2  # Copyright 2002-2004 Brad Chapman. 
   3  # Copyright 2006-2010 by Peter Cock. 
   4  # All rights reserved. 
   5  # This code is part of the Biopython distribution and governed by its 
   6  # license.  Please see the LICENSE file that should have been included 
   7  # as part of this package. 
   8  """Represent a Sequence Record, a sequence with annotation.""" 
   9  __docformat__ = "epytext en"  # Simple markup to show doctests nicely 
  10   
  11  # NEEDS TO BE SYNCH WITH THE REST OF BIOPYTHON AND BIOPERL 
  12  # In particular, the SeqRecord and BioSQL.BioSeq.DBSeqRecord classes 
  13  # need to be in sync (this is the BioSQL "Database SeqRecord", see 
  14  # also BioSQL.BioSeq.DBSeq which is the "Database Seq" class) 
  15   
  16   
17 -class _RestrictedDict(dict):
18 """Dict which only allows sequences of given length as values (PRIVATE). 19 20 This simple subclass of the Python dictionary is used in the SeqRecord 21 object for holding per-letter-annotations. This class is intended to 22 prevent simple errors by only allowing python sequences (e.g. lists, 23 strings and tuples) to be stored, and only if their length matches that 24 expected (the length of the SeqRecord's seq object). It cannot however 25 prevent the entries being edited in situ (for example appending entries 26 to a list). 27 28 >>> x = _RestrictedDict(5) 29 >>> x["test"] = "hello" 30 >>> x 31 {'test': 'hello'} 32 33 Adding entries which don't have the expected length are blocked: 34 35 >>> x["test"] = "hello world" 36 Traceback (most recent call last): 37 ... 38 TypeError: We only allow python sequences (lists, tuples or strings) of length 5. 39 40 The expected length is stored as a private attribute, 41 42 >>> x._length 43 5 44 45 In order that the SeqRecord (and other objects using this class) can be 46 pickled, for example for use in the multiprocessing library, we need to 47 be able to pickle the restricted dictionary objects. 48 49 Using the default protocol, which is 0 on Python 2.x, 50 51 >>> import pickle 52 >>> y = pickle.loads(pickle.dumps(x)) 53 >>> y 54 {'test': 'hello'} 55 >>> y._length 56 5 57 58 Using the highest protocol, which is 2 on Python 2.x, 59 60 >>> import pickle 61 >>> z = pickle.loads(pickle.dumps(x, pickle.HIGHEST_PROTOCOL)) 62 >>> z 63 {'test': 'hello'} 64 >>> z._length 65 5 66 """ 67
68 - def __init__(self, length):
69 """Create an EMPTY restricted dictionary.""" 70 dict.__init__(self) 71 self._length = int(length)
72
73 - def __setitem__(self, key, value):
74 #The check hasattr(self, "_length") is to cope with pickle protocol 2 75 #I couldn't seem to avoid this with __getstate__ and __setstate__ 76 if not hasattr(value, "__len__") or not hasattr(value, "__getitem__") \ 77 or (hasattr(self, "_length") and len(value) != self._length): 78 raise TypeError("We only allow python sequences (lists, tuples or " 79 "strings) of length %i." % self._length) 80 dict.__setitem__(self, key, value)
81
82 - def update(self, new_dict):
83 #Force this to go via our strict __setitem__ method 84 for (key, value) in new_dict.iteritems(): 85 self[key] = value
86 87
88 -class SeqRecord(object):
89 """A SeqRecord object holds a sequence and information about it. 90 91 Main attributes: 92 - id - Identifier such as a locus tag (string) 93 - seq - The sequence itself (Seq object or similar) 94 95 Additional attributes: 96 - name - Sequence name, e.g. gene name (string) 97 - description - Additional text (string) 98 - dbxrefs - List of database cross references (list of strings) 99 - features - Any (sub)features defined (list of SeqFeature objects) 100 - annotations - Further information about the whole sequence (dictionary). 101 Most entries are strings, or lists of strings. 102 - letter_annotations - Per letter/symbol annotation (restricted 103 dictionary). This holds Python sequences (lists, strings 104 or tuples) whose length matches that of the sequence. 105 A typical use would be to hold a list of integers 106 representing sequencing quality scores, or a string 107 representing the secondary structure. 108 109 You will typically use Bio.SeqIO to read in sequences from files as 110 SeqRecord objects. However, you may want to create your own SeqRecord 111 objects directly (see the __init__ method for further details): 112 113 >>> from Bio.Seq import Seq 114 >>> from Bio.SeqRecord import SeqRecord 115 >>> from Bio.Alphabet import IUPAC 116 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 117 ... IUPAC.protein), 118 ... id="YP_025292.1", name="HokC", 119 ... description="toxic membrane protein") 120 >>> print record 121 ID: YP_025292.1 122 Name: HokC 123 Description: toxic membrane protein 124 Number of features: 0 125 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 126 127 If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO 128 for this. For the special case where you want the SeqRecord turned into 129 a string in a particular file format there is a format method which uses 130 Bio.SeqIO internally: 131 132 >>> print record.format("fasta") 133 >YP_025292.1 toxic membrane protein 134 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 135 <BLANKLINE> 136 137 You can also do things like slicing a SeqRecord, checking its length, etc 138 139 >>> len(record) 140 44 141 >>> edited = record[:10] + record[11:] 142 >>> print edited.seq 143 MKQHKAMIVAIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 144 >>> print record.seq 145 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 146 147 """
148 - def __init__(self, seq, id = "<unknown id>", name = "<unknown name>", 149 description = "<unknown description>", dbxrefs = None, 150 features = None, annotations = None, 151 letter_annotations = None):
152 """Create a SeqRecord. 153 154 Arguments: 155 - seq - Sequence, required (Seq, MutableSeq or UnknownSeq) 156 - id - Sequence identifier, recommended (string) 157 - name - Sequence name, optional (string) 158 - description - Sequence description, optional (string) 159 - dbxrefs - Database cross references, optional (list of strings) 160 - features - Any (sub)features, optional (list of SeqFeature objects) 161 - annotations - Dictionary of annotations for the whole sequence 162 - letter_annotations - Dictionary of per-letter-annotations, values 163 should be strings, list or tuples of the same 164 length as the full sequence. 165 166 You will typically use Bio.SeqIO to read in sequences from files as 167 SeqRecord objects. However, you may want to create your own SeqRecord 168 objects directly. 169 170 Note that while an id is optional, we strongly recommend you supply a 171 unique id string for each record. This is especially important 172 if you wish to write your sequences to a file. 173 174 If you don't have the actual sequence, but you do know its length, 175 then using the UnknownSeq object from Bio.Seq is appropriate. 176 177 You can create a 'blank' SeqRecord object, and then populate the 178 attributes later. 179 """ 180 if id is not None and not isinstance(id, basestring): 181 #Lots of existing code uses id=None... this may be a bad idea. 182 raise TypeError("id argument should be a string") 183 if not isinstance(name, basestring): 184 raise TypeError("name argument should be a string") 185 if not isinstance(description, basestring): 186 raise TypeError("description argument should be a string") 187 self._seq = seq 188 self.id = id 189 self.name = name 190 self.description = description 191 192 # database cross references (for the whole sequence) 193 if dbxrefs is None: 194 dbxrefs = [] 195 elif not isinstance(dbxrefs, list): 196 raise TypeError("dbxrefs argument should be a list (of strings)") 197 self.dbxrefs = dbxrefs 198 199 # annotations about the whole sequence 200 if annotations is None: 201 annotations = {} 202 elif not isinstance(annotations, dict): 203 raise TypeError("annotations argument should be a dict") 204 self.annotations = annotations 205 206 if letter_annotations is None: 207 # annotations about each letter in the sequence 208 if seq is None: 209 #Should we allow this and use a normal unrestricted dict? 210 self._per_letter_annotations = _RestrictedDict(length=0) 211 else: 212 try: 213 self._per_letter_annotations = \ 214 _RestrictedDict(length=len(seq)) 215 except: 216 raise TypeError("seq argument should be a Seq object or similar") 217 else: 218 #This will be handled via the property set function, which will 219 #turn this into a _RestrictedDict and thus ensure all the values 220 #in the dict are the right length 221 self.letter_annotations = letter_annotations 222 223 # annotations about parts of the sequence 224 if features is None: 225 features = [] 226 elif not isinstance(features, list): 227 raise TypeError("features argument should be a list (of SeqFeature objects)") 228 self.features = features
229 230 #TODO - Just make this a read only property?
231 - def _set_per_letter_annotations(self, value):
232 if not isinstance(value, dict): 233 raise TypeError("The per-letter-annotations should be a " 234 "(restricted) dictionary.") 235 #Turn this into a restricted-dictionary (and check the entries) 236 try: 237 self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) 238 except AttributeError: 239 #e.g. seq is None 240 self._per_letter_annotations = _RestrictedDict(length=0) 241 self._per_letter_annotations.update(value)
242 letter_annotations = property( 243 fget=lambda self: self._per_letter_annotations, 244 fset=_set_per_letter_annotations, 245 doc="""Dictionary of per-letter-annotation for the sequence. 246 247 For example, this can hold quality scores used in FASTQ or QUAL files. 248 Consider this example using Bio.SeqIO to read in an example Solexa 249 variant FASTQ file as a SeqRecord: 250 251 >>> from Bio import SeqIO 252 >>> handle = open("Quality/solexa_faked.fastq", "rU") 253 >>> record = SeqIO.read(handle, "fastq-solexa") 254 >>> handle.close() 255 >>> print record.id, record.seq 256 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 257 >>> print record.letter_annotations.keys() 258 ['solexa_quality'] 259 >>> print record.letter_annotations["solexa_quality"] 260 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 261 262 The letter_annotations get sliced automatically if you slice the 263 parent SeqRecord, for example taking the last ten bases: 264 265 >>> sub_record = record[-10:] 266 >>> print sub_record.id, sub_record.seq 267 slxa_0001_1_0001_01 ACGTNNNNNN 268 >>> print sub_record.letter_annotations["solexa_quality"] 269 [4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 270 271 Any python sequence (i.e. list, tuple or string) can be recorded in 272 the SeqRecord's letter_annotations dictionary as long as the length 273 matches that of the SeqRecord's sequence. e.g. 274 275 >>> len(sub_record.letter_annotations) 276 1 277 >>> sub_record.letter_annotations["dummy"] = "abcdefghij" 278 >>> len(sub_record.letter_annotations) 279 2 280 281 You can delete entries from the letter_annotations dictionary as usual: 282 283 >>> del sub_record.letter_annotations["solexa_quality"] 284 >>> sub_record.letter_annotations 285 {'dummy': 'abcdefghij'} 286 287 You can completely clear the dictionary easily as follows: 288 289 >>> sub_record.letter_annotations = {} 290 >>> sub_record.letter_annotations 291 {} 292 """) 293
294 - def _set_seq(self, value):
295 #TODO - Add a deprecation warning that the seq should be write only? 296 if self._per_letter_annotations: 297 #TODO - Make this a warning? Silently empty the dictionary? 298 raise ValueError("You must empty the letter annotations first!") 299 self._seq = value 300 try: 301 self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) 302 except AttributeError: 303 #e.g. seq is None 304 self._per_letter_annotations = _RestrictedDict(length=0)
305 306 seq = property(fget=lambda self: self._seq, 307 fset=_set_seq, 308 doc="The sequence itself, as a Seq or MutableSeq object.") 309
310 - def __getitem__(self, index):
311 """Returns a sub-sequence or an individual letter. 312 313 Slicing, e.g. my_record[5:10], returns a new SeqRecord for 314 that sub-sequence with approriate annotation preserved. The 315 name, id and description are kept. 316 317 Any per-letter-annotations are sliced to match the requested 318 sub-sequence. Unless a stride is used, all those features 319 which fall fully within the subsequence are included (with 320 their locations adjusted accordingly). 321 322 However, the annotations dictionary and the dbxrefs list are 323 not used for the new SeqRecord, as in general they may not 324 apply to the subsequence. If you want to preserve them, you 325 must explictly copy them to the new SeqRecord yourself. 326 327 Using an integer index, e.g. my_record[5] is shorthand for 328 extracting that letter from the sequence, my_record.seq[5]. 329 330 For example, consider this short protein and its secondary 331 structure as encoded by the PDB (e.g. H for alpha helices), 332 plus a simple feature for its histidine self phosphorylation 333 site: 334 335 >>> from Bio.Seq import Seq 336 >>> from Bio.SeqRecord import SeqRecord 337 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation 338 >>> from Bio.Alphabet import IUPAC 339 >>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT" 340 ... "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR", 341 ... IUPAC.protein), 342 ... id="1JOY", name="EnvZ", 343 ... description="Homodimeric domain of EnvZ from E. coli") 344 >>> rec.letter_annotations["secondary_structure"] = " S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT " 345 >>> rec.features.append(SeqFeature(FeatureLocation(20,21), 346 ... type = "Site")) 347 348 Now let's have a quick look at the full record, 349 350 >>> print rec 351 ID: 1JOY 352 Name: EnvZ 353 Description: Homodimeric domain of EnvZ from E. coli 354 Number of features: 1 355 Per letter annotation for: secondary_structure 356 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein()) 357 >>> print rec.letter_annotations["secondary_structure"] 358 S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT 359 >>> print rec.features[0].location 360 [20:21] 361 362 Now let's take a sub sequence, here chosen as the first (fractured) 363 alpha helix which includes the histidine phosphorylation site: 364 365 >>> sub = rec[11:41] 366 >>> print sub 367 ID: 1JOY 368 Name: EnvZ 369 Description: Homodimeric domain of EnvZ from E. coli 370 Number of features: 1 371 Per letter annotation for: secondary_structure 372 Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD', IUPACProtein()) 373 >>> print sub.letter_annotations["secondary_structure"] 374 HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH 375 >>> print sub.features[0].location 376 [9:10] 377 378 You can also of course omit the start or end values, for 379 example to get the first ten letters only: 380 381 >>> print rec[:10] 382 ID: 1JOY 383 Name: EnvZ 384 Description: Homodimeric domain of EnvZ from E. coli 385 Number of features: 0 386 Per letter annotation for: secondary_structure 387 Seq('MAAGVKQLAD', IUPACProtein()) 388 389 Or for the last ten letters: 390 391 >>> print rec[-10:] 392 ID: 1JOY 393 Name: EnvZ 394 Description: Homodimeric domain of EnvZ from E. coli 395 Number of features: 0 396 Per letter annotation for: secondary_structure 397 Seq('IIEQFIDYLR', IUPACProtein()) 398 399 If you omit both, then you get a copy of the original record (although 400 lacking the annotations and dbxrefs): 401 402 >>> print rec[:] 403 ID: 1JOY 404 Name: EnvZ 405 Description: Homodimeric domain of EnvZ from E. coli 406 Number of features: 1 407 Per letter annotation for: secondary_structure 408 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein()) 409 410 Finally, indexing with a simple integer is shorthand for pulling out 411 that letter from the sequence directly: 412 413 >>> rec[5] 414 'K' 415 >>> rec.seq[5] 416 'K' 417 """ 418 if isinstance(index, int): 419 #NOTE - The sequence level annotation like the id, name, etc 420 #do not really apply to a single character. However, should 421 #we try and expose any per-letter-annotation here? If so how? 422 return self.seq[index] 423 elif isinstance(index, slice): 424 if self.seq is None: 425 raise ValueError("If the sequence is None, we cannot slice it.") 426 parent_length = len(self) 427 answer = self.__class__(self.seq[index], 428 id=self.id, 429 name=self.name, 430 description=self.description) 431 #TODO - The desription may no longer apply. 432 #It would be safer to change it to something 433 #generic like "edited" or the default value. 434 435 #Don't copy the annotation dict and dbxefs list, 436 #they may not apply to a subsequence. 437 #answer.annotations = dict(self.annotations.iteritems()) 438 #answer.dbxrefs = self.dbxrefs[:] 439 #TODO - Review this in light of adding SeqRecord objects? 440 441 #TODO - Cope with strides by generating ambiguous locations? 442 start, stop, step = index.indices(parent_length) 443 if step == 1: 444 #Select relevant features, add them with shifted locations 445 #assert str(self.seq)[index] == str(self.seq)[start:stop] 446 for f in self.features: 447 if f.ref or f.ref_db: 448 #TODO - Implement this (with lots of tests)? 449 import warnings 450 warnings.warn("When slicing SeqRecord objects, any " 451 "SeqFeature referencing other sequences (e.g. " 452 "from segmented GenBank records) are ignored.") 453 continue 454 if start <= f.location.nofuzzy_start \ 455 and f.location.nofuzzy_end <= stop: 456 answer.features.append(f._shift(-start)) 457 458 #Slice all the values to match the sliced sequence 459 #(this should also work with strides, even negative strides): 460 for key, value in self.letter_annotations.iteritems(): 461 answer._per_letter_annotations[key] = value[index] 462 463 return answer 464 raise ValueError("Invalid index")
465
466 - def __iter__(self):
467 """Iterate over the letters in the sequence. 468 469 For example, using Bio.SeqIO to read in a protein FASTA file: 470 471 >>> from Bio import SeqIO 472 >>> record = SeqIO.read("Fasta/loveliesbleeding.pro", "fasta") 473 >>> for amino in record: 474 ... print amino 475 ... if amino == "L": break 476 X 477 A 478 G 479 L 480 >>> print record.seq[3] 481 L 482 483 This is just a shortcut for iterating over the sequence directly: 484 485 >>> for amino in record.seq: 486 ... print amino 487 ... if amino == "L": break 488 X 489 A 490 G 491 L 492 >>> print record.seq[3] 493 L 494 495 Note that this does not facilitate iteration together with any 496 per-letter-annotation. However, you can achieve that using the 497 python zip function on the record (or its sequence) and the relevant 498 per-letter-annotation: 499 500 >>> from Bio import SeqIO 501 >>> rec = SeqIO.read(open("Quality/solexa_faked.fastq", "rU"), 502 ... "fastq-solexa") 503 >>> print rec.id, rec.seq 504 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 505 >>> print rec.letter_annotations.keys() 506 ['solexa_quality'] 507 >>> for nuc, qual in zip(rec,rec.letter_annotations["solexa_quality"]): 508 ... if qual > 35: 509 ... print nuc, qual 510 A 40 511 C 39 512 G 38 513 T 37 514 A 36 515 516 You may agree that using zip(rec.seq, ...) is more explicit than using 517 zip(rec, ...) as shown above. 518 """ 519 return iter(self.seq)
520
521 - def __contains__(self, char):
522 """Implements the 'in' keyword, searches the sequence. 523 524 e.g. 525 526 >>> from Bio import SeqIO 527 >>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta") 528 >>> "GAATTC" in record 529 False 530 >>> "AAA" in record 531 True 532 533 This essentially acts as a proxy for using "in" on the sequence: 534 535 >>> "GAATTC" in record.seq 536 False 537 >>> "AAA" in record.seq 538 True 539 540 Note that you can also use Seq objects as the query, 541 542 >>> from Bio.Seq import Seq 543 >>> from Bio.Alphabet import generic_dna 544 >>> Seq("AAA") in record 545 True 546 >>> Seq("AAA", generic_dna) in record 547 True 548 549 See also the Seq object's __contains__ method. 550 """ 551 return char in self.seq
552
553 - def __str__(self):
554 """A human readable summary of the record and its annotation (string). 555 556 The python built in function str works by calling the object's ___str__ 557 method. e.g. 558 559 >>> from Bio.Seq import Seq 560 >>> from Bio.SeqRecord import SeqRecord 561 >>> from Bio.Alphabet import IUPAC 562 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 563 ... IUPAC.protein), 564 ... id="YP_025292.1", name="HokC", 565 ... description="toxic membrane protein, small") 566 >>> print str(record) 567 ID: YP_025292.1 568 Name: HokC 569 Description: toxic membrane protein, small 570 Number of features: 0 571 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 572 573 In this example you don't actually need to call str explicity, as the 574 print command does this automatically: 575 576 >>> print record 577 ID: YP_025292.1 578 Name: HokC 579 Description: toxic membrane protein, small 580 Number of features: 0 581 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 582 583 Note that long sequences are shown truncated. 584 """ 585 lines = [] 586 if self.id: 587 lines.append("ID: %s" % self.id) 588 if self.name: 589 lines.append("Name: %s" % self.name) 590 if self.description: 591 lines.append("Description: %s" % self.description) 592 if self.dbxrefs: 593 lines.append("Database cross-references: " 594 + ", ".join(self.dbxrefs)) 595 lines.append("Number of features: %i" % len(self.features)) 596 for a in self.annotations: 597 lines.append("/%s=%s" % (a, str(self.annotations[a]))) 598 if self.letter_annotations: 599 lines.append("Per letter annotation for: " 600 + ", ".join(self.letter_annotations.keys())) 601 #Don't want to include the entire sequence, 602 #and showing the alphabet is useful: 603 lines.append(repr(self.seq)) 604 return "\n".join(lines)
605
606 - def __repr__(self):
607 """A concise summary of the record for debugging (string). 608 609 The python built in function repr works by calling the object's ___repr__ 610 method. e.g. 611 612 >>> from Bio.Seq import Seq 613 >>> from Bio.SeqRecord import SeqRecord 614 >>> from Bio.Alphabet import generic_protein 615 >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT" 616 ... +"GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ" 617 ... +"SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ" 618 ... +"PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF", 619 ... generic_protein), 620 ... id="NP_418483.1", name="b4059", 621 ... description="ssDNA-binding protein", 622 ... dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"]) 623 >>> print repr(rec) 624 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 625 626 At the python prompt you can also use this shorthand: 627 628 >>> rec 629 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 630 631 Note that long sequences are shown truncated. Also note that any 632 annotations, letter_annotations and features are not shown (as they 633 would lead to a very long string). 634 """ 635 return self.__class__.__name__ \ 636 + "(seq=%s, id=%s, name=%s, description=%s, dbxrefs=%s)" \ 637 % tuple(map(repr, (self.seq, self.id, self.name, 638 self.description, self.dbxrefs)))
639
640 - def format(self, format):
641 r"""Returns the record as a string in the specified file format. 642 643 The format should be a lower case string supported as an output 644 format by Bio.SeqIO, which is used to turn the SeqRecord into a 645 string. e.g. 646 647 >>> from Bio.Seq import Seq 648 >>> from Bio.SeqRecord import SeqRecord 649 >>> from Bio.Alphabet import IUPAC 650 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 651 ... IUPAC.protein), 652 ... id="YP_025292.1", name="HokC", 653 ... description="toxic membrane protein") 654 >>> record.format("fasta") 655 '>YP_025292.1 toxic membrane protein\nMKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\n' 656 >>> print record.format("fasta") 657 >YP_025292.1 toxic membrane protein 658 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 659 <BLANKLINE> 660 661 The python print command automatically appends a new line, meaning 662 in this example a blank line is shown. If you look at the string 663 representation you can see there is a trailing new line (shown as 664 slash n) which is important when writing to a file or if 665 concatenating multiple sequence strings together. 666 667 Note that this method will NOT work on every possible file format 668 supported by Bio.SeqIO (e.g. some are for multiple sequences only). 669 """ 670 #See also the __format__ added for Python 2.6 / 3.0, PEP 3101 671 #See also the Bio.Align.Generic.Alignment class and its format() 672 return self.__format__(format)
673
674 - def __format__(self, format_spec):
675 """Returns the record as a string in the specified file format. 676 677 This method supports the python format() function added in 678 Python 2.6/3.0. The format_spec should be a lower case string 679 supported by Bio.SeqIO as an output file format. See also the 680 SeqRecord's format() method. 681 """ 682 if not format_spec: 683 #Follow python convention and default to using __str__ 684 return str(self) 685 from Bio import SeqIO 686 if format_spec in SeqIO._BinaryFormats: 687 #Return bytes on Python 3 688 try: 689 #This is in Python 2.6+, but we need it on Python 3 690 from io import BytesIO 691 handle = BytesIO() 692 except ImportError: 693 #Must be on Python 2.5 or older 694 from StringIO import StringIO 695 handle = StringIO() 696 else: 697 from StringIO import StringIO 698 handle = StringIO() 699 SeqIO.write(self, handle, format_spec) 700 return handle.getvalue()
701
702 - def __len__(self):
703 """Returns the length of the sequence. 704 705 For example, using Bio.SeqIO to read in a FASTA nucleotide file: 706 707 >>> from Bio import SeqIO 708 >>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta") 709 >>> len(record) 710 309 711 >>> len(record.seq) 712 309 713 """ 714 return len(self.seq)
715
716 - def __nonzero__(self):
717 """Returns True regardless of the length of the sequence. 718 719 This behaviour is for backwards compatibility, since until the 720 __len__ method was added, a SeqRecord always evaluated as True. 721 722 Note that in comparison, a Seq object will evaluate to False if it 723 has a zero length sequence. 724 725 WARNING: The SeqRecord may in future evaluate to False when its 726 sequence is of zero length (in order to better match the Seq 727 object behaviour)! 728 """ 729 return True
730
731 - def __add__(self, other):
732 """Add another sequence or string to this sequence. 733 734 The other sequence can be a SeqRecord object, a Seq object (or 735 similar, e.g. a MutableSeq) or a plain Python string. If you add 736 a plain string or a Seq (like) object, the new SeqRecord will simply 737 have this appended to the existing data. However, any per letter 738 annotation will be lost: 739 740 >>> from Bio import SeqIO 741 >>> handle = open("Quality/solexa_faked.fastq", "rU") 742 >>> record = SeqIO.read(handle, "fastq-solexa") 743 >>> handle.close() 744 >>> print record.id, record.seq 745 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 746 >>> print record.letter_annotations.keys() 747 ['solexa_quality'] 748 749 >>> new = record + "ACT" 750 >>> print new.id, new.seq 751 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNNACT 752 >>> print new.letter_annotations.keys() 753 [] 754 755 The new record will attempt to combine the annotation, but for any 756 ambiguities (e.g. different names) it defaults to omitting that 757 annotation. 758 759 >>> from Bio import SeqIO 760 >>> handle = open("GenBank/pBAD30.gb") 761 >>> plasmid = SeqIO.read(handle, "gb") 762 >>> handle.close() 763 >>> print plasmid.id, len(plasmid) 764 pBAD30 4923 765 766 Now let's cut the plasmid into two pieces, and join them back up the 767 other way round (i.e. shift the starting point on this plasmid, have 768 a look at the annotated features in the original file to see why this 769 particular split point might make sense): 770 771 >>> left = plasmid[:3765] 772 >>> right = plasmid[3765:] 773 >>> new = right + left 774 >>> print new.id, len(new) 775 pBAD30 4923 776 >>> str(new.seq) == str(right.seq + left.seq) 777 True 778 >>> len(new.features) == len(left.features) + len(right.features) 779 True 780 781 When we add the left and right SeqRecord objects, their annotation 782 is all consistent, so it is all conserved in the new SeqRecord: 783 784 >>> new.id == left.id == right.id == plasmid.id 785 True 786 >>> new.name == left.name == right.name == plasmid.name 787 True 788 >>> new.description == plasmid.description 789 True 790 >>> new.annotations == left.annotations == right.annotations 791 True 792 >>> new.letter_annotations == plasmid.letter_annotations 793 True 794 >>> new.dbxrefs == left.dbxrefs == right.dbxrefs 795 True 796 797 However, we should point out that when we sliced the SeqRecord, 798 any annotations dictionary or dbxrefs list entries were lost. 799 You can explicitly copy them like this: 800 801 >>> new.annotations = plasmid.annotations.copy() 802 >>> new.dbxrefs = plasmid.dbxrefs[:] 803 """ 804 if not isinstance(other, SeqRecord): 805 #Assume it is a string or a Seq. 806 #Note can't transfer any per-letter-annotations 807 return SeqRecord(self.seq + other, 808 id = self.id, name = self.name, 809 description = self.description, 810 features = self.features[:], 811 annotations = self.annotations.copy(), 812 dbxrefs = self.dbxrefs[:]) 813 #Adding two SeqRecord objects... must merge annotation. 814 answer = SeqRecord(self.seq + other.seq, 815 features = self.features[:], 816 dbxrefs = self.dbxrefs[:]) 817 #Will take all the features and all the db cross refs, 818 l = len(self) 819 for f in other.features: 820 answer.features.append(f._shift(l)) 821 del l 822 for ref in other.dbxrefs: 823 if ref not in answer.dbxrefs: 824 answer.dbxrefs.append(ref) 825 #Take common id/name/description/annotation 826 if self.id == other.id: 827 answer.id = self.id 828 if self.name == other.name: 829 answer.name = self.name 830 if self.description == other.description: 831 answer.description = self.description 832 for k, v in self.annotations.iteritems(): 833 if k in other.annotations and other.annotations[k] == v: 834 answer.annotations[k] = v 835 #Can append matching per-letter-annotation 836 for k, v in self.letter_annotations.iteritems(): 837 if k in other.letter_annotations: 838 answer.letter_annotations[k] = v + other.letter_annotations[k] 839 return answer
840
841 - def __radd__(self, other):
842 """Add another sequence or string to this sequence (from the left). 843 844 This method handles adding a Seq object (or similar, e.g. MutableSeq) 845 or a plain Python string (on the left) to a SeqRecord (on the right). 846 See the __add__ method for more details, but for example: 847 848 >>> from Bio import SeqIO 849 >>> handle = open("Quality/solexa_faked.fastq", "rU") 850 >>> record = SeqIO.read(handle, "fastq-solexa") 851 >>> handle.close() 852 >>> print record.id, record.seq 853 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 854 >>> print record.letter_annotations.keys() 855 ['solexa_quality'] 856 857 >>> new = "ACT" + record 858 >>> print new.id, new.seq 859 slxa_0001_1_0001_01 ACTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 860 >>> print new.letter_annotations.keys() 861 [] 862 """ 863 if isinstance(other, SeqRecord): 864 raise RuntimeError("This should have happened via the __add__ of " 865 "the other SeqRecord being added!") 866 #Assume it is a string or a Seq. 867 #Note can't transfer any per-letter-annotations 868 offset = len(other) 869 return SeqRecord(other + self.seq, 870 id = self.id, name = self.name, 871 description = self.description, 872 features = [f._shift(offset) for f in self.features], 873 annotations = self.annotations.copy(), 874 dbxrefs = self.dbxrefs[:])
875
876 - def upper(self):
877 """Returns a copy of the record with an upper case sequence. 878 879 All the annotation is preserved unchanged. e.g. 880 881 >>> from Bio.Alphabet import generic_dna 882 >>> from Bio.Seq import Seq 883 >>> from Bio.SeqRecord import SeqRecord 884 >>> record = SeqRecord(Seq("acgtACGT", generic_dna), id="Test", 885 ... description = "Made up for this example") 886 >>> record.letter_annotations["phred_quality"] = [1,2,3,4,5,6,7,8] 887 >>> print record.upper().format("fastq") 888 @Test Made up for this example 889 ACGTACGT 890 + 891 "#$%&'() 892 <BLANKLINE> 893 894 Naturally, there is a matching lower method: 895 896 >>> print record.lower().format("fastq") 897 @Test Made up for this example 898 acgtacgt 899 + 900 "#$%&'() 901 <BLANKLINE> 902 """ 903 return SeqRecord(self.seq.upper(), 904 id = self.id, name = self.name, 905 description = self.description, 906 dbxrefs = self.dbxrefs[:], 907 features = self.features[:], 908 annotations = self.annotations.copy(), 909 letter_annotations=self.letter_annotations.copy())
910
911 - def lower(self):
912 """Returns a copy of the record with a lower case sequence. 913 914 All the annotation is preserved unchanged. e.g. 915 916 >>> from Bio import SeqIO 917 >>> record = SeqIO.read("Fasta/aster.pro", "fasta") 918 >>> print record.format("fasta") 919 >gi|3298468|dbj|BAA31520.1| SAMIPF 920 GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVG 921 VTNALVFEIVMTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI 922 <BLANKLINE> 923 >>> print record.lower().format("fasta") 924 >gi|3298468|dbj|BAA31520.1| SAMIPF 925 gghvnpavtfgafvggnitllrgivyiiaqllgstvaclllkfvtndmavgvfslsagvg 926 vtnalvfeivmtfglvytvyataidpkkgslgtiapiaigfivgani 927 <BLANKLINE> 928 929 To take a more annotation rich example, 930 931 >>> from Bio import SeqIO 932 >>> old = SeqIO.read("EMBL/TRBG361.embl", "embl") 933 >>> len(old.features) 934 3 935 >>> new = old.lower() 936 >>> len(old.features) == len(new.features) 937 True 938 >>> old.annotations["organism"] == new.annotations["organism"] 939 True 940 >>> old.dbxrefs == new.dbxrefs 941 True 942 """ 943 return SeqRecord(self.seq.lower(), 944 id = self.id, name = self.name, 945 description = self.description, 946 dbxrefs = self.dbxrefs[:], 947 features = self.features[:], 948 annotations = self.annotations.copy(), 949 letter_annotations=self.letter_annotations.copy())
950
951 - def reverse_complement(self, id=False, name=False, description=False, 952 features=True, annotations=False, 953 letter_annotations=True, dbxrefs=False):
954 """Returns new SeqRecord with reverse complement sequence. 955 956 You can specify the returned record's id, name and description as 957 strings, or True to keep that of the parent, or False for a default. 958 959 You can specify the returned record's features with a list of 960 SeqFeature objects, or True to keep that of the parent, or False to 961 omit them. The default is to keep the original features (with the 962 strand and locations adjusted). 963 964 You can also specify both the returned record's annotations and 965 letter_annotations as dictionaries, True to keep that of the parent, 966 or False to omit them. The default is to keep the original 967 annotations (with the letter annotations reversed). 968 969 To show what happens to the pre-letter annotations, consider an 970 example Solexa variant FASTQ file with a single entry, which we'll 971 read in as a SeqRecord: 972 973 >>> from Bio import SeqIO 974 >>> handle = open("Quality/solexa_faked.fastq", "rU") 975 >>> record = SeqIO.read(handle, "fastq-solexa") 976 >>> handle.close() 977 >>> print record.id, record.seq 978 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 979 >>> print record.letter_annotations.keys() 980 ['solexa_quality'] 981 >>> print record.letter_annotations["solexa_quality"] 982 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 983 984 Now take the reverse complement, 985 986 >>> rc_record = record.reverse_complement(id=record.id+"_rc") 987 >>> print rc_record.id, rc_record.seq 988 slxa_0001_1_0001_01_rc NNNNNNACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT 989 990 Notice that the per-letter-annotations have also been reversed, 991 although this may not be appropriate for all cases. 992 993 >>> print rc_record.letter_annotations["solexa_quality"] 994 [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40] 995 996 Now for the features, we need a different example. Parsing a GenBank 997 file is probably the easiest way to get an nice example with features 998 in it... 999 1000 >>> from Bio import SeqIO 1001 >>> handle = open("GenBank/pBAD30.gb") 1002 >>> plasmid = SeqIO.read(handle, "gb") 1003 >>> handle.close() 1004 >>> print plasmid.id, len(plasmid) 1005 pBAD30 4923 1006 >>> plasmid.seq 1007 Seq('GCTAGCGGAGTGTATACTGGCTTACTATGTTGGCACTGATGAGGGTGTCAGTGA...ATG', IUPACAmbiguousDNA()) 1008 >>> len(plasmid.features) 1009 13 1010 1011 Now, let's take the reverse complement of this whole plasmid: 1012 1013 >>> rc_plasmid = plasmid.reverse_complement(id=plasmid.id+"_rc") 1014 >>> print rc_plasmid.id, len(rc_plasmid) 1015 pBAD30_rc 4923 1016 >>> rc_plasmid.seq 1017 Seq('CATGGGCAAATATTATACGCAAGGCGACAAGGTGCTGATGCCGCTGGCGATTCA...AGC', IUPACAmbiguousDNA()) 1018 >>> len(rc_plasmid.features) 1019 13 1020 1021 Let's compare the first CDS feature - it has gone from being the 1022 second feature (index 1) to the second last feature (index -2), its 1023 strand has changed, and the location switched round. 1024 1025 >>> print plasmid.features[1] 1026 type: CDS 1027 location: [1081:1960](-) 1028 qualifiers: 1029 Key: label, Value: ['araC'] 1030 Key: note, Value: ['araC regulator of the arabinose BAD promoter'] 1031 Key: vntifkey, Value: ['4'] 1032 <BLANKLINE> 1033 >>> print rc_plasmid.features[-2] 1034 type: CDS 1035 location: [2963:3842](+) 1036 qualifiers: 1037 Key: label, Value: ['araC'] 1038 Key: note, Value: ['araC regulator of the arabinose BAD promoter'] 1039 Key: vntifkey, Value: ['4'] 1040 <BLANKLINE> 1041 1042 You can check this new location, based on the length of the plasmid: 1043 1044 >>> len(plasmid) - 1081 1045 3842 1046 >>> len(plasmid) - 1960 1047 2963 1048 1049 Note that if the SeqFeature annotation includes any strand specific 1050 information (e.g. base changes for a SNP), this information is not 1051 ammended, and would need correction after the reverse complement. 1052 1053 Note trying to reverse complement a protein SeqRecord raises an 1054 exception: 1055 1056 >>> from Bio.SeqRecord import SeqRecord 1057 >>> from Bio.Seq import Seq 1058 >>> from Bio.Alphabet import IUPAC 1059 >>> protein_rec = SeqRecord(Seq("MAIVMGR", IUPAC.protein), id="Test") 1060 >>> protein_rec.reverse_complement() 1061 Traceback (most recent call last): 1062 ... 1063 ValueError: Proteins do not have complements! 1064 1065 Also note you can reverse complement a SeqRecord using a MutableSeq: 1066 1067 >>> from Bio.SeqRecord import SeqRecord 1068 >>> from Bio.Seq import MutableSeq 1069 >>> from Bio.Alphabet import generic_dna 1070 >>> rec = SeqRecord(MutableSeq("ACGT", generic_dna), id="Test") 1071 >>> rec.seq[0] = "T" 1072 >>> print rec.id, rec.seq 1073 Test TCGT 1074 >>> rc = rec.reverse_complement(id=True) 1075 >>> print rc.id, rc.seq 1076 Test ACGA 1077 """ 1078 from Bio.Seq import MutableSeq # Lazy to avoid circular imports 1079 if isinstance(self.seq, MutableSeq): 1080 #Currently the MutableSeq reverse complement is in situ 1081 answer = SeqRecord(self.seq.toseq().reverse_complement()) 1082 else: 1083 answer = SeqRecord(self.seq.reverse_complement()) 1084 if isinstance(id, basestring): 1085 answer.id = id 1086 elif id: 1087 answer.id = self.id 1088 if isinstance(name, basestring): 1089 answer.name = name 1090 elif name: 1091 answer.name = self.name 1092 if isinstance(description, basestring): 1093 answer.description = description 1094 elif description: 1095 answer.description = self.description 1096 if isinstance(dbxrefs, list): 1097 answer.dbxrefs = dbxrefs 1098 elif dbxrefs: 1099 #Copy the old dbxrefs 1100 answer.dbxrefs = self.dbxrefs[:] 1101 if isinstance(features, list): 1102 answer.features = features 1103 elif features: 1104 #Copy the old features, adjusting location and string 1105 l = len(answer) 1106 answer.features = [f._flip(l) for f in self.features] 1107 #The old list should have been sorted by start location, 1108 #reversing it will leave it sorted by what is now the end position, 1109 #so we need to resort in case of overlapping features. 1110 #NOTE - In the common case of gene before CDS (and similar) with 1111 #the exact same locations, this will still maintain gene before CDS 1112 answer.features.sort(key=lambda x: x.location.start.position) 1113 if isinstance(annotations, dict): 1114 answer.annotations = annotations 1115 elif annotations: 1116 #Copy the old annotations, 1117 answer.annotations = self.annotations.copy() 1118 if isinstance(letter_annotations, dict): 1119 answer.letter_annotations = letter_annotations 1120 elif letter_annotations: 1121 #Copy the old per letter annotations, reversing them 1122 for key, value in self.letter_annotations.iteritems(): 1123 answer._per_letter_annotations[key] = value[::-1] 1124 return answer
1125 1126 1127 if __name__ == "__main__": 1128 from Bio._utils import run_doctest 1129 run_doctest() 1130