Package Bio :: Module SeqRecord
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqRecord

   1  # Copyright 2000-2002 Andrew Dalke. 
   2  # Copyright 2002-2004 Brad Chapman. 
   3  # Copyright 2006-2010 by Peter Cock. 
   4  # All rights reserved. 
   5  # This code is part of the Biopython distribution and governed by its 
   6  # license.  Please see the LICENSE file that should have been included 
   7  # as part of this package. 
   8  """Represent a Sequence Record, a sequence with annotation.""" 
   9   
  10   
  11  from Bio._py3k import basestring 
  12   
  13  __docformat__ = "epytext en"  # Simple markup to show doctests nicely 
  14   
  15  # NEEDS TO BE SYNCH WITH THE REST OF BIOPYTHON AND BIOPERL 
  16  # In particular, the SeqRecord and BioSQL.BioSeq.DBSeqRecord classes 
  17  # need to be in sync (this is the BioSQL "Database SeqRecord", see 
  18  # also BioSQL.BioSeq.DBSeq which is the "Database Seq" class) 
  19   
  20   
21 -class _RestrictedDict(dict):
22 """Dict which only allows sequences of given length as values (PRIVATE). 23 24 This simple subclass of the Python dictionary is used in the SeqRecord 25 object for holding per-letter-annotations. This class is intended to 26 prevent simple errors by only allowing python sequences (e.g. lists, 27 strings and tuples) to be stored, and only if their length matches that 28 expected (the length of the SeqRecord's seq object). It cannot however 29 prevent the entries being edited in situ (for example appending entries 30 to a list). 31 32 >>> x = _RestrictedDict(5) 33 >>> x["test"] = "hello" 34 >>> x 35 {'test': 'hello'} 36 37 Adding entries which don't have the expected length are blocked: 38 39 >>> x["test"] = "hello world" 40 Traceback (most recent call last): 41 ... 42 TypeError: We only allow python sequences (lists, tuples or strings) of length 5. 43 44 The expected length is stored as a private attribute, 45 46 >>> x._length 47 5 48 49 In order that the SeqRecord (and other objects using this class) can be 50 pickled, for example for use in the multiprocessing library, we need to 51 be able to pickle the restricted dictionary objects. 52 53 Using the default protocol, which is 0 on Python 2.x, 54 55 >>> import pickle 56 >>> y = pickle.loads(pickle.dumps(x)) 57 >>> y 58 {'test': 'hello'} 59 >>> y._length 60 5 61 62 Using the highest protocol, which is 2 on Python 2.x, 63 64 >>> import pickle 65 >>> z = pickle.loads(pickle.dumps(x, pickle.HIGHEST_PROTOCOL)) 66 >>> z 67 {'test': 'hello'} 68 >>> z._length 69 5 70 """ 71
72 - def __init__(self, length):
73 """Create an EMPTY restricted dictionary.""" 74 dict.__init__(self) 75 self._length = int(length)
76
77 - def __setitem__(self, key, value):
78 #The check hasattr(self, "_length") is to cope with pickle protocol 2 79 #I couldn't seem to avoid this with __getstate__ and __setstate__ 80 if not hasattr(value, "__len__") or not hasattr(value, "__getitem__") \ 81 or (hasattr(self, "_length") and len(value) != self._length): 82 raise TypeError("We only allow python sequences (lists, tuples or " 83 "strings) of length %i." % self._length) 84 dict.__setitem__(self, key, value)
85
86 - def update(self, new_dict):
87 #Force this to go via our strict __setitem__ method 88 for (key, value) in new_dict.items(): 89 self[key] = value
90 91
92 -class SeqRecord(object):
93 """A SeqRecord object holds a sequence and information about it. 94 95 Main attributes: 96 - id - Identifier such as a locus tag (string) 97 - seq - The sequence itself (Seq object or similar) 98 99 Additional attributes: 100 - name - Sequence name, e.g. gene name (string) 101 - description - Additional text (string) 102 - dbxrefs - List of database cross references (list of strings) 103 - features - Any (sub)features defined (list of SeqFeature objects) 104 - annotations - Further information about the whole sequence (dictionary). 105 Most entries are strings, or lists of strings. 106 - letter_annotations - Per letter/symbol annotation (restricted 107 dictionary). This holds Python sequences (lists, strings 108 or tuples) whose length matches that of the sequence. 109 A typical use would be to hold a list of integers 110 representing sequencing quality scores, or a string 111 representing the secondary structure. 112 113 You will typically use Bio.SeqIO to read in sequences from files as 114 SeqRecord objects. However, you may want to create your own SeqRecord 115 objects directly (see the __init__ method for further details): 116 117 >>> from Bio.Seq import Seq 118 >>> from Bio.SeqRecord import SeqRecord 119 >>> from Bio.Alphabet import IUPAC 120 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 121 ... IUPAC.protein), 122 ... id="YP_025292.1", name="HokC", 123 ... description="toxic membrane protein") 124 >>> print(record) 125 ID: YP_025292.1 126 Name: HokC 127 Description: toxic membrane protein 128 Number of features: 0 129 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 130 131 If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO 132 for this. For the special case where you want the SeqRecord turned into 133 a string in a particular file format there is a format method which uses 134 Bio.SeqIO internally: 135 136 >>> print(record.format("fasta")) 137 >YP_025292.1 toxic membrane protein 138 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 139 <BLANKLINE> 140 141 You can also do things like slicing a SeqRecord, checking its length, etc 142 143 >>> len(record) 144 44 145 >>> edited = record[:10] + record[11:] 146 >>> print(edited.seq) 147 MKQHKAMIVAIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 148 >>> print(record.seq) 149 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 150 151 """
152 - def __init__(self, seq, id = "<unknown id>", name = "<unknown name>", 153 description = "<unknown description>", dbxrefs = None, 154 features = None, annotations = None, 155 letter_annotations = None):
156 """Create a SeqRecord. 157 158 Arguments: 159 - seq - Sequence, required (Seq, MutableSeq or UnknownSeq) 160 - id - Sequence identifier, recommended (string) 161 - name - Sequence name, optional (string) 162 - description - Sequence description, optional (string) 163 - dbxrefs - Database cross references, optional (list of strings) 164 - features - Any (sub)features, optional (list of SeqFeature objects) 165 - annotations - Dictionary of annotations for the whole sequence 166 - letter_annotations - Dictionary of per-letter-annotations, values 167 should be strings, list or tuples of the same 168 length as the full sequence. 169 170 You will typically use Bio.SeqIO to read in sequences from files as 171 SeqRecord objects. However, you may want to create your own SeqRecord 172 objects directly. 173 174 Note that while an id is optional, we strongly recommend you supply a 175 unique id string for each record. This is especially important 176 if you wish to write your sequences to a file. 177 178 If you don't have the actual sequence, but you do know its length, 179 then using the UnknownSeq object from Bio.Seq is appropriate. 180 181 You can create a 'blank' SeqRecord object, and then populate the 182 attributes later. 183 """ 184 if id is not None and not isinstance(id, basestring): 185 #Lots of existing code uses id=None... this may be a bad idea. 186 raise TypeError("id argument should be a string") 187 if not isinstance(name, basestring): 188 raise TypeError("name argument should be a string") 189 if not isinstance(description, basestring): 190 raise TypeError("description argument should be a string") 191 self._seq = seq 192 self.id = id 193 self.name = name 194 self.description = description 195 196 # database cross references (for the whole sequence) 197 if dbxrefs is None: 198 dbxrefs = [] 199 elif not isinstance(dbxrefs, list): 200 raise TypeError("dbxrefs argument should be a list (of strings)") 201 self.dbxrefs = dbxrefs 202 203 # annotations about the whole sequence 204 if annotations is None: 205 annotations = {} 206 elif not isinstance(annotations, dict): 207 raise TypeError("annotations argument should be a dict") 208 self.annotations = annotations 209 210 if letter_annotations is None: 211 # annotations about each letter in the sequence 212 if seq is None: 213 #Should we allow this and use a normal unrestricted dict? 214 self._per_letter_annotations = _RestrictedDict(length=0) 215 else: 216 try: 217 self._per_letter_annotations = \ 218 _RestrictedDict(length=len(seq)) 219 except: 220 raise TypeError("seq argument should be a Seq object or similar") 221 else: 222 #This will be handled via the property set function, which will 223 #turn this into a _RestrictedDict and thus ensure all the values 224 #in the dict are the right length 225 self.letter_annotations = letter_annotations 226 227 # annotations about parts of the sequence 228 if features is None: 229 features = [] 230 elif not isinstance(features, list): 231 raise TypeError("features argument should be a list (of SeqFeature objects)") 232 self.features = features
233 234 #TODO - Just make this a read only property?
235 - def _set_per_letter_annotations(self, value):
236 if not isinstance(value, dict): 237 raise TypeError("The per-letter-annotations should be a " 238 "(restricted) dictionary.") 239 #Turn this into a restricted-dictionary (and check the entries) 240 try: 241 self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) 242 except AttributeError: 243 #e.g. seq is None 244 self._per_letter_annotations = _RestrictedDict(length=0) 245 self._per_letter_annotations.update(value)
246 letter_annotations = property( 247 fget=lambda self: self._per_letter_annotations, 248 fset=_set_per_letter_annotations, 249 doc="""Dictionary of per-letter-annotation for the sequence. 250 251 For example, this can hold quality scores used in FASTQ or QUAL files. 252 Consider this example using Bio.SeqIO to read in an example Solexa 253 variant FASTQ file as a SeqRecord: 254 255 >>> from Bio import SeqIO 256 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 257 >>> print("%s %s" % (record.id, record.seq)) 258 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 259 >>> print(list(record.letter_annotations)) 260 ['solexa_quality'] 261 >>> print(record.letter_annotations["solexa_quality"]) 262 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 263 264 The letter_annotations get sliced automatically if you slice the 265 parent SeqRecord, for example taking the last ten bases: 266 267 >>> sub_record = record[-10:] 268 >>> print("%s %s" % (sub_record.id, sub_record.seq)) 269 slxa_0001_1_0001_01 ACGTNNNNNN 270 >>> print(sub_record.letter_annotations["solexa_quality"]) 271 [4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 272 273 Any python sequence (i.e. list, tuple or string) can be recorded in 274 the SeqRecord's letter_annotations dictionary as long as the length 275 matches that of the SeqRecord's sequence. e.g. 276 277 >>> len(sub_record.letter_annotations) 278 1 279 >>> sub_record.letter_annotations["dummy"] = "abcdefghij" 280 >>> len(sub_record.letter_annotations) 281 2 282 283 You can delete entries from the letter_annotations dictionary as usual: 284 285 >>> del sub_record.letter_annotations["solexa_quality"] 286 >>> sub_record.letter_annotations 287 {'dummy': 'abcdefghij'} 288 289 You can completely clear the dictionary easily as follows: 290 291 >>> sub_record.letter_annotations = {} 292 >>> sub_record.letter_annotations 293 {} 294 """) 295
296 - def _set_seq(self, value):
297 #TODO - Add a deprecation warning that the seq should be write only? 298 if self._per_letter_annotations: 299 #TODO - Make this a warning? Silently empty the dictionary? 300 raise ValueError("You must empty the letter annotations first!") 301 self._seq = value 302 try: 303 self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) 304 except AttributeError: 305 #e.g. seq is None 306 self._per_letter_annotations = _RestrictedDict(length=0)
307 308 seq = property(fget=lambda self: self._seq, 309 fset=_set_seq, 310 doc="The sequence itself, as a Seq or MutableSeq object.") 311
312 - def __getitem__(self, index):
313 """Returns a sub-sequence or an individual letter. 314 315 Slicing, e.g. my_record[5:10], returns a new SeqRecord for 316 that sub-sequence with approriate annotation preserved. The 317 name, id and description are kept. 318 319 Any per-letter-annotations are sliced to match the requested 320 sub-sequence. Unless a stride is used, all those features 321 which fall fully within the subsequence are included (with 322 their locations adjusted accordingly). 323 324 However, the annotations dictionary and the dbxrefs list are 325 not used for the new SeqRecord, as in general they may not 326 apply to the subsequence. If you want to preserve them, you 327 must explictly copy them to the new SeqRecord yourself. 328 329 Using an integer index, e.g. my_record[5] is shorthand for 330 extracting that letter from the sequence, my_record.seq[5]. 331 332 For example, consider this short protein and its secondary 333 structure as encoded by the PDB (e.g. H for alpha helices), 334 plus a simple feature for its histidine self phosphorylation 335 site: 336 337 >>> from Bio.Seq import Seq 338 >>> from Bio.SeqRecord import SeqRecord 339 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation 340 >>> from Bio.Alphabet import IUPAC 341 >>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT" 342 ... "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR", 343 ... IUPAC.protein), 344 ... id="1JOY", name="EnvZ", 345 ... description="Homodimeric domain of EnvZ from E. coli") 346 >>> rec.letter_annotations["secondary_structure"] = " S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT " 347 >>> rec.features.append(SeqFeature(FeatureLocation(20, 21), 348 ... type = "Site")) 349 350 Now let's have a quick look at the full record, 351 352 >>> print(rec) 353 ID: 1JOY 354 Name: EnvZ 355 Description: Homodimeric domain of EnvZ from E. coli 356 Number of features: 1 357 Per letter annotation for: secondary_structure 358 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein()) 359 >>> print(rec.letter_annotations["secondary_structure"]) 360 S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT 361 >>> print(rec.features[0].location) 362 [20:21] 363 364 Now let's take a sub sequence, here chosen as the first (fractured) 365 alpha helix which includes the histidine phosphorylation site: 366 367 >>> sub = rec[11:41] 368 >>> print(sub) 369 ID: 1JOY 370 Name: EnvZ 371 Description: Homodimeric domain of EnvZ from E. coli 372 Number of features: 1 373 Per letter annotation for: secondary_structure 374 Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD', IUPACProtein()) 375 >>> print(sub.letter_annotations["secondary_structure"]) 376 HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH 377 >>> print(sub.features[0].location) 378 [9:10] 379 380 You can also of course omit the start or end values, for 381 example to get the first ten letters only: 382 383 >>> print(rec[:10]) 384 ID: 1JOY 385 Name: EnvZ 386 Description: Homodimeric domain of EnvZ from E. coli 387 Number of features: 0 388 Per letter annotation for: secondary_structure 389 Seq('MAAGVKQLAD', IUPACProtein()) 390 391 Or for the last ten letters: 392 393 >>> print(rec[-10:]) 394 ID: 1JOY 395 Name: EnvZ 396 Description: Homodimeric domain of EnvZ from E. coli 397 Number of features: 0 398 Per letter annotation for: secondary_structure 399 Seq('IIEQFIDYLR', IUPACProtein()) 400 401 If you omit both, then you get a copy of the original record (although 402 lacking the annotations and dbxrefs): 403 404 >>> print(rec[:]) 405 ID: 1JOY 406 Name: EnvZ 407 Description: Homodimeric domain of EnvZ from E. coli 408 Number of features: 1 409 Per letter annotation for: secondary_structure 410 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein()) 411 412 Finally, indexing with a simple integer is shorthand for pulling out 413 that letter from the sequence directly: 414 415 >>> rec[5] 416 'K' 417 >>> rec.seq[5] 418 'K' 419 """ 420 if isinstance(index, int): 421 #NOTE - The sequence level annotation like the id, name, etc 422 #do not really apply to a single character. However, should 423 #we try and expose any per-letter-annotation here? If so how? 424 return self.seq[index] 425 elif isinstance(index, slice): 426 if self.seq is None: 427 raise ValueError("If the sequence is None, we cannot slice it.") 428 parent_length = len(self) 429 answer = self.__class__(self.seq[index], 430 id=self.id, 431 name=self.name, 432 description=self.description) 433 #TODO - The desription may no longer apply. 434 #It would be safer to change it to something 435 #generic like "edited" or the default value. 436 437 #Don't copy the annotation dict and dbxefs list, 438 #they may not apply to a subsequence. 439 #answer.annotations = dict(self.annotations.items()) 440 #answer.dbxrefs = self.dbxrefs[:] 441 #TODO - Review this in light of adding SeqRecord objects? 442 443 #TODO - Cope with strides by generating ambiguous locations? 444 start, stop, step = index.indices(parent_length) 445 if step == 1: 446 #Select relevant features, add them with shifted locations 447 #assert str(self.seq)[index] == str(self.seq)[start:stop] 448 for f in self.features: 449 if f.ref or f.ref_db: 450 #TODO - Implement this (with lots of tests)? 451 import warnings 452 warnings.warn("When slicing SeqRecord objects, any " 453 "SeqFeature referencing other sequences (e.g. " 454 "from segmented GenBank records) are ignored.") 455 continue 456 if start <= f.location.nofuzzy_start \ 457 and f.location.nofuzzy_end <= stop: 458 answer.features.append(f._shift(-start)) 459 460 #Slice all the values to match the sliced sequence 461 #(this should also work with strides, even negative strides): 462 for key, value in self.letter_annotations.items(): 463 answer._per_letter_annotations[key] = value[index] 464 465 return answer 466 raise ValueError("Invalid index")
467
468 - def __iter__(self):
469 """Iterate over the letters in the sequence. 470 471 For example, using Bio.SeqIO to read in a protein FASTA file: 472 473 >>> from Bio import SeqIO 474 >>> record = SeqIO.read("Fasta/loveliesbleeding.pro", "fasta") 475 >>> for amino in record: 476 ... print(amino) 477 ... if amino == "L": break 478 X 479 A 480 G 481 L 482 >>> print(record.seq[3]) 483 L 484 485 This is just a shortcut for iterating over the sequence directly: 486 487 >>> for amino in record.seq: 488 ... print(amino) 489 ... if amino == "L": break 490 X 491 A 492 G 493 L 494 >>> print(record.seq[3]) 495 L 496 497 Note that this does not facilitate iteration together with any 498 per-letter-annotation. However, you can achieve that using the 499 python zip function on the record (or its sequence) and the relevant 500 per-letter-annotation: 501 502 >>> from Bio import SeqIO 503 >>> rec = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 504 >>> print("%s %s" % (rec.id, rec.seq)) 505 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 506 >>> print(list(rec.letter_annotations)) 507 ['solexa_quality'] 508 >>> for nuc, qual in zip(rec, rec.letter_annotations["solexa_quality"]): 509 ... if qual > 35: 510 ... print("%s %i" % (nuc, qual)) 511 A 40 512 C 39 513 G 38 514 T 37 515 A 36 516 517 You may agree that using zip(rec.seq, ...) is more explicit than using 518 zip(rec, ...) as shown above. 519 """ 520 return iter(self.seq)
521
522 - def __contains__(self, char):
523 """Implements the 'in' keyword, searches the sequence. 524 525 e.g. 526 527 >>> from Bio import SeqIO 528 >>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta") 529 >>> "GAATTC" in record 530 False 531 >>> "AAA" in record 532 True 533 534 This essentially acts as a proxy for using "in" on the sequence: 535 536 >>> "GAATTC" in record.seq 537 False 538 >>> "AAA" in record.seq 539 True 540 541 Note that you can also use Seq objects as the query, 542 543 >>> from Bio.Seq import Seq 544 >>> from Bio.Alphabet import generic_dna 545 >>> Seq("AAA") in record 546 True 547 >>> Seq("AAA", generic_dna) in record 548 True 549 550 See also the Seq object's __contains__ method. 551 """ 552 return char in self.seq
553
554 - def __str__(self):
555 """A human readable summary of the record and its annotation (string). 556 557 The python built in function str works by calling the object's ___str__ 558 method. e.g. 559 560 >>> from Bio.Seq import Seq 561 >>> from Bio.SeqRecord import SeqRecord 562 >>> from Bio.Alphabet import IUPAC 563 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 564 ... IUPAC.protein), 565 ... id="YP_025292.1", name="HokC", 566 ... description="toxic membrane protein, small") 567 >>> print(str(record)) 568 ID: YP_025292.1 569 Name: HokC 570 Description: toxic membrane protein, small 571 Number of features: 0 572 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 573 574 In this example you don't actually need to call str explicity, as the 575 print command does this automatically: 576 577 >>> print(record) 578 ID: YP_025292.1 579 Name: HokC 580 Description: toxic membrane protein, small 581 Number of features: 0 582 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 583 584 Note that long sequences are shown truncated. 585 """ 586 lines = [] 587 if self.id: 588 lines.append("ID: %s" % self.id) 589 if self.name: 590 lines.append("Name: %s" % self.name) 591 if self.description: 592 lines.append("Description: %s" % self.description) 593 if self.dbxrefs: 594 lines.append("Database cross-references: " 595 + ", ".join(self.dbxrefs)) 596 lines.append("Number of features: %i" % len(self.features)) 597 for a in self.annotations: 598 lines.append("/%s=%s" % (a, str(self.annotations[a]))) 599 if self.letter_annotations: 600 lines.append("Per letter annotation for: " 601 + ", ".join(self.letter_annotations)) 602 #Don't want to include the entire sequence, 603 #and showing the alphabet is useful: 604 lines.append(repr(self.seq)) 605 return "\n".join(lines)
606
607 - def __repr__(self):
608 """A concise summary of the record for debugging (string). 609 610 The python built in function repr works by calling the object's ___repr__ 611 method. e.g. 612 613 >>> from Bio.Seq import Seq 614 >>> from Bio.SeqRecord import SeqRecord 615 >>> from Bio.Alphabet import generic_protein 616 >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT" 617 ... +"GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ" 618 ... +"SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ" 619 ... +"PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF", 620 ... generic_protein), 621 ... id="NP_418483.1", name="b4059", 622 ... description="ssDNA-binding protein", 623 ... dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"]) 624 >>> print(repr(rec)) 625 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 626 627 At the python prompt you can also use this shorthand: 628 629 >>> rec 630 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 631 632 Note that long sequences are shown truncated. Also note that any 633 annotations, letter_annotations and features are not shown (as they 634 would lead to a very long string). 635 """ 636 return self.__class__.__name__ \ 637 + "(seq=%s, id=%s, name=%s, description=%s, dbxrefs=%s)" \ 638 % tuple(map(repr, (self.seq, self.id, self.name, 639 self.description, self.dbxrefs)))
640
641 - def format(self, format):
642 r"""Returns the record as a string in the specified file format. 643 644 The format should be a lower case string supported as an output 645 format by Bio.SeqIO, which is used to turn the SeqRecord into a 646 string. e.g. 647 648 >>> from Bio.Seq import Seq 649 >>> from Bio.SeqRecord import SeqRecord 650 >>> from Bio.Alphabet import IUPAC 651 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 652 ... IUPAC.protein), 653 ... id="YP_025292.1", name="HokC", 654 ... description="toxic membrane protein") 655 >>> record.format("fasta") 656 '>YP_025292.1 toxic membrane protein\nMKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\n' 657 >>> print(record.format("fasta")) 658 >YP_025292.1 toxic membrane protein 659 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 660 <BLANKLINE> 661 662 The python print command automatically appends a new line, meaning 663 in this example a blank line is shown. If you look at the string 664 representation you can see there is a trailing new line (shown as 665 slash n) which is important when writing to a file or if 666 concatenating multiple sequence strings together. 667 668 Note that this method will NOT work on every possible file format 669 supported by Bio.SeqIO (e.g. some are for multiple sequences only). 670 """ 671 #See also the __format__ added for Python 2.6 / 3.0, PEP 3101 672 #See also the Bio.Align.Generic.Alignment class and its format() 673 return self.__format__(format)
674
675 - def __format__(self, format_spec):
676 """Returns the record as a string in the specified file format. 677 678 This method supports the python format() function added in 679 Python 2.6/3.0. The format_spec should be a lower case string 680 supported by Bio.SeqIO as an output file format. See also the 681 SeqRecord's format() method. 682 683 Under Python 3 please note that for binary formats a bytes 684 string is returned, otherwise a (unicode) string is returned. 685 """ 686 if not format_spec: 687 #Follow python convention and default to using __str__ 688 return str(self) 689 from Bio import SeqIO 690 if format_spec in SeqIO._BinaryFormats: 691 #Return bytes on Python 3 692 from io import BytesIO 693 handle = BytesIO() 694 else: 695 from Bio._py3k import StringIO 696 handle = StringIO() 697 SeqIO.write(self, handle, format_spec) 698 return handle.getvalue()
699
700 - def __len__(self):
701 """Returns the length of the sequence. 702 703 For example, using Bio.SeqIO to read in a FASTA nucleotide file: 704 705 >>> from Bio import SeqIO 706 >>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta") 707 >>> len(record) 708 309 709 >>> len(record.seq) 710 309 711 """ 712 return len(self.seq)
713 714 #Python 3:
715 - def __bool__(self):
716 """Boolean value of an instance of this class (True). 717 718 This behaviour is for backwards compatibility, since until the 719 __len__ method was added, a SeqRecord always evaluated as True. 720 721 Note that in comparison, a Seq object will evaluate to False if it 722 has a zero length sequence. 723 724 WARNING: The SeqRecord may in future evaluate to False when its 725 sequence is of zero length (in order to better match the Seq 726 object behaviour)! 727 """ 728 return True
729 730 #Python 2: 731 __nonzero__= __bool__ 732
733 - def __add__(self, other):
734 """Add another sequence or string to this sequence. 735 736 The other sequence can be a SeqRecord object, a Seq object (or 737 similar, e.g. a MutableSeq) or a plain Python string. If you add 738 a plain string or a Seq (like) object, the new SeqRecord will simply 739 have this appended to the existing data. However, any per letter 740 annotation will be lost: 741 742 >>> from Bio import SeqIO 743 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 744 >>> print("%s %s" % (record.id, record.seq)) 745 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 746 >>> print(list(record.letter_annotations)) 747 ['solexa_quality'] 748 749 >>> new = record + "ACT" 750 >>> print("%s %s" % (new.id, new.seq)) 751 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNNACT 752 >>> print(list(new.letter_annotations)) 753 [] 754 755 The new record will attempt to combine the annotation, but for any 756 ambiguities (e.g. different names) it defaults to omitting that 757 annotation. 758 759 >>> from Bio import SeqIO 760 >>> with open("GenBank/pBAD30.gb") as handle: 761 ... plasmid = SeqIO.read(handle, "gb") 762 >>> print("%s %i" % (plasmid.id, len(plasmid))) 763 pBAD30 4923 764 765 Now let's cut the plasmid into two pieces, and join them back up the 766 other way round (i.e. shift the starting point on this plasmid, have 767 a look at the annotated features in the original file to see why this 768 particular split point might make sense): 769 770 >>> left = plasmid[:3765] 771 >>> right = plasmid[3765:] 772 >>> new = right + left 773 >>> print("%s %i" % (new.id, len(new))) 774 pBAD30 4923 775 >>> str(new.seq) == str(right.seq + left.seq) 776 True 777 >>> len(new.features) == len(left.features) + len(right.features) 778 True 779 780 When we add the left and right SeqRecord objects, their annotation 781 is all consistent, so it is all conserved in the new SeqRecord: 782 783 >>> new.id == left.id == right.id == plasmid.id 784 True 785 >>> new.name == left.name == right.name == plasmid.name 786 True 787 >>> new.description == plasmid.description 788 True 789 >>> new.annotations == left.annotations == right.annotations 790 True 791 >>> new.letter_annotations == plasmid.letter_annotations 792 True 793 >>> new.dbxrefs == left.dbxrefs == right.dbxrefs 794 True 795 796 However, we should point out that when we sliced the SeqRecord, 797 any annotations dictionary or dbxrefs list entries were lost. 798 You can explicitly copy them like this: 799 800 >>> new.annotations = plasmid.annotations.copy() 801 >>> new.dbxrefs = plasmid.dbxrefs[:] 802 """ 803 if not isinstance(other, SeqRecord): 804 #Assume it is a string or a Seq. 805 #Note can't transfer any per-letter-annotations 806 return SeqRecord(self.seq + other, 807 id = self.id, name = self.name, 808 description = self.description, 809 features = self.features[:], 810 annotations = self.annotations.copy(), 811 dbxrefs = self.dbxrefs[:]) 812 #Adding two SeqRecord objects... must merge annotation. 813 answer = SeqRecord(self.seq + other.seq, 814 features = self.features[:], 815 dbxrefs = self.dbxrefs[:]) 816 #Will take all the features and all the db cross refs, 817 l = len(self) 818 for f in other.features: 819 answer.features.append(f._shift(l)) 820 del l 821 for ref in other.dbxrefs: 822 if ref not in answer.dbxrefs: 823 answer.dbxrefs.append(ref) 824 #Take common id/name/description/annotation 825 if self.id == other.id: 826 answer.id = self.id 827 if self.name == other.name: 828 answer.name = self.name 829 if self.description == other.description: 830 answer.description = self.description 831 for k, v in self.annotations.items(): 832 if k in other.annotations and other.annotations[k] == v: 833 answer.annotations[k] = v 834 #Can append matching per-letter-annotation 835 for k, v in self.letter_annotations.items(): 836 if k in other.letter_annotations: 837 answer.letter_annotations[k] = v + other.letter_annotations[k] 838 return answer
839
840 - def __radd__(self, other):
841 """Add another sequence or string to this sequence (from the left). 842 843 This method handles adding a Seq object (or similar, e.g. MutableSeq) 844 or a plain Python string (on the left) to a SeqRecord (on the right). 845 See the __add__ method for more details, but for example: 846 847 >>> from Bio import SeqIO 848 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 849 >>> print("%s %s" % (record.id, record.seq)) 850 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 851 >>> print(list(record.letter_annotations)) 852 ['solexa_quality'] 853 854 >>> new = "ACT" + record 855 >>> print("%s %s" % (new.id, new.seq)) 856 slxa_0001_1_0001_01 ACTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 857 >>> print(list(new.letter_annotations)) 858 [] 859 """ 860 if isinstance(other, SeqRecord): 861 raise RuntimeError("This should have happened via the __add__ of " 862 "the other SeqRecord being added!") 863 #Assume it is a string or a Seq. 864 #Note can't transfer any per-letter-annotations 865 offset = len(other) 866 return SeqRecord(other + self.seq, 867 id = self.id, name = self.name, 868 description = self.description, 869 features = [f._shift(offset) for f in self.features], 870 annotations = self.annotations.copy(), 871 dbxrefs = self.dbxrefs[:])
872
873 - def upper(self):
874 """Returns a copy of the record with an upper case sequence. 875 876 All the annotation is preserved unchanged. e.g. 877 878 >>> from Bio.Alphabet import generic_dna 879 >>> from Bio.Seq import Seq 880 >>> from Bio.SeqRecord import SeqRecord 881 >>> record = SeqRecord(Seq("acgtACGT", generic_dna), id="Test", 882 ... description = "Made up for this example") 883 >>> record.letter_annotations["phred_quality"] = [1, 2, 3, 4, 5, 6, 7, 8] 884 >>> print(record.upper().format("fastq")) 885 @Test Made up for this example 886 ACGTACGT 887 + 888 "#$%&'() 889 <BLANKLINE> 890 891 Naturally, there is a matching lower method: 892 893 >>> print(record.lower().format("fastq")) 894 @Test Made up for this example 895 acgtacgt 896 + 897 "#$%&'() 898 <BLANKLINE> 899 """ 900 return SeqRecord(self.seq.upper(), 901 id = self.id, name = self.name, 902 description = self.description, 903 dbxrefs = self.dbxrefs[:], 904 features = self.features[:], 905 annotations = self.annotations.copy(), 906 letter_annotations=self.letter_annotations.copy())
907
908 - def lower(self):
909 """Returns a copy of the record with a lower case sequence. 910 911 All the annotation is preserved unchanged. e.g. 912 913 >>> from Bio import SeqIO 914 >>> record = SeqIO.read("Fasta/aster.pro", "fasta") 915 >>> print(record.format("fasta")) 916 >gi|3298468|dbj|BAA31520.1| SAMIPF 917 GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVG 918 VTNALVFEIVMTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI 919 <BLANKLINE> 920 >>> print(record.lower().format("fasta")) 921 >gi|3298468|dbj|BAA31520.1| SAMIPF 922 gghvnpavtfgafvggnitllrgivyiiaqllgstvaclllkfvtndmavgvfslsagvg 923 vtnalvfeivmtfglvytvyataidpkkgslgtiapiaigfivgani 924 <BLANKLINE> 925 926 To take a more annotation rich example, 927 928 >>> from Bio import SeqIO 929 >>> old = SeqIO.read("EMBL/TRBG361.embl", "embl") 930 >>> len(old.features) 931 3 932 >>> new = old.lower() 933 >>> len(old.features) == len(new.features) 934 True 935 >>> old.annotations["organism"] == new.annotations["organism"] 936 True 937 >>> old.dbxrefs == new.dbxrefs 938 True 939 """ 940 return SeqRecord(self.seq.lower(), 941 id = self.id, name = self.name, 942 description = self.description, 943 dbxrefs = self.dbxrefs[:], 944 features = self.features[:], 945 annotations = self.annotations.copy(), 946 letter_annotations=self.letter_annotations.copy())
947
948 - def reverse_complement(self, id=False, name=False, description=False, 949 features=True, annotations=False, 950 letter_annotations=True, dbxrefs=False):
951 """Returns new SeqRecord with reverse complement sequence. 952 953 You can specify the returned record's id, name and description as 954 strings, or True to keep that of the parent, or False for a default. 955 956 You can specify the returned record's features with a list of 957 SeqFeature objects, or True to keep that of the parent, or False to 958 omit them. The default is to keep the original features (with the 959 strand and locations adjusted). 960 961 You can also specify both the returned record's annotations and 962 letter_annotations as dictionaries, True to keep that of the parent, 963 or False to omit them. The default is to keep the original 964 annotations (with the letter annotations reversed). 965 966 To show what happens to the pre-letter annotations, consider an 967 example Solexa variant FASTQ file with a single entry, which we'll 968 read in as a SeqRecord: 969 970 >>> from Bio import SeqIO 971 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 972 >>> print("%s %s" % (record.id, record.seq)) 973 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 974 >>> print(list(record.letter_annotations)) 975 ['solexa_quality'] 976 >>> print(record.letter_annotations["solexa_quality"]) 977 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 978 979 Now take the reverse complement, 980 981 >>> rc_record = record.reverse_complement(id=record.id+"_rc") 982 >>> print("%s %s" % (rc_record.id, rc_record.seq)) 983 slxa_0001_1_0001_01_rc NNNNNNACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT 984 985 Notice that the per-letter-annotations have also been reversed, 986 although this may not be appropriate for all cases. 987 988 >>> print(rc_record.letter_annotations["solexa_quality"]) 989 [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40] 990 991 Now for the features, we need a different example. Parsing a GenBank 992 file is probably the easiest way to get an nice example with features 993 in it... 994 995 >>> from Bio import SeqIO 996 >>> with open("GenBank/pBAD30.gb") as handle: 997 ... plasmid = SeqIO.read(handle, "gb") 998 >>> print("%s %i" % (plasmid.id, len(plasmid))) 999 pBAD30 4923 1000 >>> plasmid.seq 1001 Seq('GCTAGCGGAGTGTATACTGGCTTACTATGTTGGCACTGATGAGGGTGTCAGTGA...ATG', IUPACAmbiguousDNA()) 1002 >>> len(plasmid.features) 1003 13 1004 1005 Now, let's take the reverse complement of this whole plasmid: 1006 1007 >>> rc_plasmid = plasmid.reverse_complement(id=plasmid.id+"_rc") 1008 >>> print("%s %i" % (rc_plasmid.id, len(rc_plasmid))) 1009 pBAD30_rc 4923 1010 >>> rc_plasmid.seq 1011 Seq('CATGGGCAAATATTATACGCAAGGCGACAAGGTGCTGATGCCGCTGGCGATTCA...AGC', IUPACAmbiguousDNA()) 1012 >>> len(rc_plasmid.features) 1013 13 1014 1015 Let's compare the first CDS feature - it has gone from being the 1016 second feature (index 1) to the second last feature (index -2), its 1017 strand has changed, and the location switched round. 1018 1019 >>> print(plasmid.features[1]) 1020 type: CDS 1021 location: [1081:1960](-) 1022 qualifiers: 1023 Key: label, Value: ['araC'] 1024 Key: note, Value: ['araC regulator of the arabinose BAD promoter'] 1025 Key: vntifkey, Value: ['4'] 1026 <BLANKLINE> 1027 >>> print(rc_plasmid.features[-2]) 1028 type: CDS 1029 location: [2963:3842](+) 1030 qualifiers: 1031 Key: label, Value: ['araC'] 1032 Key: note, Value: ['araC regulator of the arabinose BAD promoter'] 1033 Key: vntifkey, Value: ['4'] 1034 <BLANKLINE> 1035 1036 You can check this new location, based on the length of the plasmid: 1037 1038 >>> len(plasmid) - 1081 1039 3842 1040 >>> len(plasmid) - 1960 1041 2963 1042 1043 Note that if the SeqFeature annotation includes any strand specific 1044 information (e.g. base changes for a SNP), this information is not 1045 ammended, and would need correction after the reverse complement. 1046 1047 Note trying to reverse complement a protein SeqRecord raises an 1048 exception: 1049 1050 >>> from Bio.SeqRecord import SeqRecord 1051 >>> from Bio.Seq import Seq 1052 >>> from Bio.Alphabet import IUPAC 1053 >>> protein_rec = SeqRecord(Seq("MAIVMGR", IUPAC.protein), id="Test") 1054 >>> protein_rec.reverse_complement() 1055 Traceback (most recent call last): 1056 ... 1057 ValueError: Proteins do not have complements! 1058 1059 Also note you can reverse complement a SeqRecord using a MutableSeq: 1060 1061 >>> from Bio.SeqRecord import SeqRecord 1062 >>> from Bio.Seq import MutableSeq 1063 >>> from Bio.Alphabet import generic_dna 1064 >>> rec = SeqRecord(MutableSeq("ACGT", generic_dna), id="Test") 1065 >>> rec.seq[0] = "T" 1066 >>> print("%s %s" % (rec.id, rec.seq)) 1067 Test TCGT 1068 >>> rc = rec.reverse_complement(id=True) 1069 >>> print("%s %s" % (rc.id, rc.seq)) 1070 Test ACGA 1071 """ 1072 from Bio.Seq import MutableSeq # Lazy to avoid circular imports 1073 if isinstance(self.seq, MutableSeq): 1074 #Currently the MutableSeq reverse complement is in situ 1075 answer = SeqRecord(self.seq.toseq().reverse_complement()) 1076 else: 1077 answer = SeqRecord(self.seq.reverse_complement()) 1078 if isinstance(id, basestring): 1079 answer.id = id 1080 elif id: 1081 answer.id = self.id 1082 if isinstance(name, basestring): 1083 answer.name = name 1084 elif name: 1085 answer.name = self.name 1086 if isinstance(description, basestring): 1087 answer.description = description 1088 elif description: 1089 answer.description = self.description 1090 if isinstance(dbxrefs, list): 1091 answer.dbxrefs = dbxrefs 1092 elif dbxrefs: 1093 #Copy the old dbxrefs 1094 answer.dbxrefs = self.dbxrefs[:] 1095 if isinstance(features, list): 1096 answer.features = features 1097 elif features: 1098 #Copy the old features, adjusting location and string 1099 l = len(answer) 1100 answer.features = [f._flip(l) for f in self.features] 1101 #The old list should have been sorted by start location, 1102 #reversing it will leave it sorted by what is now the end position, 1103 #so we need to resort in case of overlapping features. 1104 #NOTE - In the common case of gene before CDS (and similar) with 1105 #the exact same locations, this will still maintain gene before CDS 1106 answer.features.sort(key=lambda x: x.location.start.position) 1107 if isinstance(annotations, dict): 1108 answer.annotations = annotations 1109 elif annotations: 1110 #Copy the old annotations, 1111 answer.annotations = self.annotations.copy() 1112 if isinstance(letter_annotations, dict): 1113 answer.letter_annotations = letter_annotations 1114 elif letter_annotations: 1115 #Copy the old per letter annotations, reversing them 1116 for key, value in self.letter_annotations.items(): 1117 answer._per_letter_annotations[key] = value[::-1] 1118 return answer
1119 1120 1121 if __name__ == "__main__": 1122 from Bio._utils import run_doctest 1123 run_doctest() 1124